//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation ------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "X86ISelLowering.h"
#include "Utils/X86ShuffleDecode.h"
#include "X86CallingConv.h"
#include "X86InstrBuilder.h"
#include "X86MachineFunctionInfo.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/VariadicFunction.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"
#include "X86IntrinsicsInfo.h"
#include <bitset>
#include <numeric>
#include <cctype>
using namespace llvm;

#define DEBUG_TYPE "x86-isel"

STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<bool> ExperimentalVectorWideningLegalization(
    "x86-experimental-vector-widening-legalization", cl::init(false),
    cl::desc("Enable an experimental vector type legalization through widening "
             "rather than promotion."),
    cl::Hidden);

static cl::opt<bool> ExperimentalVectorShuffleLowering(
    "x86-experimental-vector-shuffle-lowering", cl::init(false),
    cl::desc("Enable an experimental vector shuffle lowering code path."),
    cl::Hidden);

// Forward declarations.
static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
                       SDValue V2);

static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
                                SelectionDAG &DAG, SDLoc dl,
                                unsigned vectorWidth) {
  assert((vectorWidth == 128 || vectorWidth == 256) &&
         "Unsupported vector width");
  EVT VT = Vec.getValueType();
  EVT ElVT = VT.getVectorElementType();
  unsigned Factor = VT.getSizeInBits()/vectorWidth;
  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
                                  VT.getVectorNumElements()/Factor);

  // Extract from UNDEF is UNDEF.
  if (Vec.getOpcode() == ISD::UNDEF)
    return DAG.getUNDEF(ResultVT);

  // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR.
  unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();

  // This is the index of the first element of the vectorWidth-bit chunk
  // we want.
  unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
                               * ElemsPerChunk);

  // If the input is a buildvector just emit a smaller one.
  if (Vec.getOpcode() == ISD::BUILD_VECTOR)
    return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
                       makeArrayRef(Vec->op_begin()+NormalizedIdxVal,
                                    ElemsPerChunk));

  SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
  SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
                               VecIdx);

  return Result;
}

/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
/// instructions or a simple subregister reference. Idx is an index in the
/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering EXTRACT_VECTOR_ELT operations easier.
static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
                                   SelectionDAG &DAG, SDLoc dl) {
  assert((Vec.getValueType().is256BitVector() ||
          Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
  return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
}

/// Generate a DAG to grab 256-bits from a 512-bit vector.
static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
                                   SelectionDAG &DAG, SDLoc dl) {
  assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
  return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
}

static SDValue InsertSubVector(SDValue Result, SDValue Vec,
                               unsigned IdxVal, SelectionDAG &DAG,
                               SDLoc dl, unsigned vectorWidth) {
  assert((vectorWidth == 128 || vectorWidth == 256) &&
         "Unsupported vector width");
  // Inserting UNDEF is Result
  if (Vec.getOpcode() == ISD::UNDEF)
    return Result;
  EVT VT = Vec.getValueType();
  EVT ElVT = VT.getVectorElementType();
  EVT ResultVT = Result.getValueType();

  // Insert the relevant vectorWidth bits.
  unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();

  // This is the index of the first element of the vectorWidth-bit chunk
  // we want.
  unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
                               * ElemsPerChunk);

  SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
                     VecIdx);
}
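
// Note: a common use of the helpers above and below is to split a wide
// operation into 128-bit halves and recombine them, e.g. (sketch only; the
// variable names are illustrative):
//   SDValue Lo = Extract128BitVector(Wide, 0, DAG, dl);
//   SDValue Hi = Extract128BitVector(Wide, NumElems / 2, DAG, dl);
//   // ... lower each half ...
//   SDValue Res = Concat128BitVectors(LoRes, HiRes, WideVT, NumElems, DAG, dl);
// The index normalization in ExtractSubVector/InsertSubVector rounds IdxVal
// down to its containing chunk: for a 256-bit v8i32 source with
// vectorWidth == 128, ElemsPerChunk == 4, so IdxVal == 5 normalizes to 4,
// selecting the upper 128-bit half.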
/// Generate a DAG to put 128-bits into a vector > 128 bits. This
/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
/// simple superregister reference. Idx is an index in the 128 bits
/// we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering INSERT_VECTOR_ELT operations easier.
static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
                                  unsigned IdxVal, SelectionDAG &DAG,
                                  SDLoc dl) {
  assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
  return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
}

static SDValue Insert256BitVector(SDValue Result, SDValue Vec,
                                  unsigned IdxVal, SelectionDAG &DAG,
                                  SDLoc dl) {
  assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
  return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
}

/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
/// instructions. This is used because creating CONCAT_VECTOR nodes of
/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
/// large BUILD_VECTORS.
static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
                                   unsigned NumElems, SelectionDAG &DAG,
                                   SDLoc dl) {
  SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
  return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
}

static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
                                   unsigned NumElems, SelectionDAG &DAG,
                                   SDLoc dl) {
  SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
  return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
}

static TargetLoweringObjectFile *createTLOF(const Triple &TT) {
  if (TT.isOSBinFormatMachO()) {
    if (TT.getArch() == Triple::x86_64)
      return new X86_64MachoTargetObjectFile();
    return new TargetLoweringObjectFileMachO();
  }

  if (TT.isOSLinux())
    return new X86LinuxTargetObjectFile();
  if (TT.isOSBinFormatELF())
    return new TargetLoweringObjectFileELF();
  if (TT.isKnownWindowsMSVCEnvironment())
    return new X86WindowsTargetObjectFile();
  if (TT.isOSBinFormatCOFF())
    return new TargetLoweringObjectFileCOFF();
  llvm_unreachable("unknown subtarget type");
}

// FIXME: This should stop caching the target machine as soon as
// we can remove resetOperationActions et al.
X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
    : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) {
  Subtarget = &TM.getSubtarget<X86Subtarget>();
  X86ScalarSSEf64 = Subtarget->hasSSE2();
  X86ScalarSSEf32 = Subtarget->hasSSE1();
  TD = getDataLayout();

  resetOperationActions();
}

void X86TargetLowering::resetOperationActions() {
  const TargetMachine &TM = getTargetMachine();
  static bool FirstTimeThrough = true;

  // If none of the target options have changed, then we don't need to reset
  // the operation actions.
  if (!FirstTimeThrough && TO == TM.Options) return;

  if (!FirstTimeThrough) {
    // Reinitialize the actions.
    initActions();
    FirstTimeThrough = false;
  }

  TO = TM.Options;

  // Set up the TargetLowering object.
  static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
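
  // A brief reminder on the TargetLowering "action" values used heavily below
  // (standard LLVM legalization semantics): Legal means the operation is
  // natively selectable for that type; Promote means it is carried out in a
  // larger or different type; Expand means it is broken into other operations
  // (or a libcall); Custom means the target's LowerOperation hook emits a
  // target-specific sequence.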
  // X86 is weird, it always uses i8 for shift amounts and setcc results.
  setBooleanContents(ZeroOrOneBooleanContent);
  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // For 64-bit since we have so many registers use the ILP scheduler, for
  // 32-bit code use the register pressure specific scheduling.
  // For Atom, always use ILP scheduling.
  if (Subtarget->isAtom())
    setSchedulingPreference(Sched::ILP);
  else if (Subtarget->is64Bit())
    setSchedulingPreference(Sched::ILP);
  else
    setSchedulingPreference(Sched::RegPressure);
  const X86RegisterInfo *RegInfo =
      TM.getSubtarget<X86Subtarget>().getRegisterInfo();
  setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());

  // Bypass expensive divides on Atom when compiling with O2
  if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default) {
    addBypassSlowDiv(32, 8);
    if (Subtarget->is64Bit())
      addBypassSlowDiv(64, 16);
  }

  if (Subtarget->isTargetKnownWindowsMSVC()) {
    // Set up Windows compiler runtime calls.
    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
    setLibcallName(RTLIB::SREM_I64, "_allrem");
    setLibcallName(RTLIB::UREM_I64, "_aullrem");
    setLibcallName(RTLIB::MUL_I64, "_allmul");
    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);

    // The _ftol2 runtime function has an unusual calling conv, which
    // is modeled by a special pseudo-instruction.
    setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr);
    setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr);
    setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr);
    setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr);
  }

  if (Subtarget->isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget->isTargetWindowsGNU()) {
    // MS runtime is weird: it exports _setjmp, but longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }

  // Set up the register classes.
  addRegisterClass(MVT::i8, &X86::GR8RegClass);
  addRegisterClass(MVT::i16, &X86::GR16RegClass);
  addRegisterClass(MVT::i32, &X86::GR32RegClass);
  if (Subtarget->is64Bit())
    addRegisterClass(MVT::i64, &X86::GR64RegClass);

  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8, Expand);
  setTruncStoreAction(MVT::i16, MVT::i8, Expand);

  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);

  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
  } else if (!TM.Options.UseSoftFloat) {
    // We have an algorithm for SSE2->double, and we turn this into a
    // 64-bit FILD followed by conditional FADD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  }

  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);

  if (!TM.Options.UseSoftFloat) {
    // SSE has no i16 to fp conversion, only i32
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Promote);
  }

  // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
  // are Legal, f80 is custom lowered.
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);

  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTs, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);

  if (X86ScalarSSEf32) {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    // f32 and f64 cases are Legal, f80 case is not
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  } else {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  }

  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
  } else if (!TM.Options.UseSoftFloat) {
    // Since AVX is a superset of SSE3, only check for SSE here.
    if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
    else
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  }

  if (isTargetFTOL()) {
    // Use the _ftol2 runtime function, which has a pseudo-instruction
    // to handle its weird calling convention.
    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  }

  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BITCAST, MVT::f32, Expand);
    setOperationAction(ISD::BITCAST, MVT::i32, Expand);
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::BITCAST, MVT::f64, Expand);
      // Without SSE, i64->f64 goes through memory.
      setOperationAction(ISD::BITCAST, MVT::i64, Expand);
    }
  }

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
  for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
    MVT VT = IntVTs[i];
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);

    // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
    setOperationAction(ISD::ADDC, VT, Custom);
    setOperationAction(ISD::ADDE, VT, Custom);
    setOperationAction(ISD::SUBC, VT, Custom);
    setOperationAction(ISD::SUBE, VT, Custom);
  }

  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f64, Expand);
  setOperationAction(ISD::BR_CC, MVT::f80, Expand);
  setOperationAction(ISD::BR_CC, MVT::i8, Expand);
  setOperationAction(ISD::BR_CC, MVT::i16, Expand);
  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::i64, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::f80, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i8, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  setOperationAction(ISD::FP_ROUND_INREG, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);
  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);

  // Promote the i8 variants and force them on up to i32 which has a shorter
  // encoding.
  setOperationAction(ISD::CTTZ, MVT::i8, Promote);
  AddPromotedToType(ISD::CTTZ, MVT::i8, MVT::i32);
  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i8, Promote);
  AddPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8, MVT::i32);
  if (Subtarget->hasBMI()) {
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Expand);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
  } else {
    setOperationAction(ISD::CTTZ, MVT::i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::i32, Custom);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTTZ, MVT::i64, Custom);
  }

  if (Subtarget->hasLZCNT()) {
    // When promoting the i8 variants, force them to i32 for a shorter
    // encoding.
    setOperationAction(ISD::CTLZ, MVT::i8, Promote);
    AddPromotedToType(ISD::CTLZ, MVT::i8, MVT::i32);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8, Promote);
    AddPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8, MVT::i32);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Expand);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
  } else {
    setOperationAction(ISD::CTLZ, MVT::i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ, MVT::i32, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::CTLZ, MVT::i64, Custom);
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
    }
  }

  // Special handling for half-precision floating point conversions.
  // If we don't have F16C support, then lower half float conversions
  // into library calls.
  if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) {
    setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
    setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
  }

  // There's never any support for operations beyond MVT::f32.
  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
  setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f80, MVT::f16, Expand);

  if (Subtarget->hasPOPCNT()) {
    setOperationAction(ISD::CTPOP, MVT::i8, Promote);
  } else {
    setOperationAction(ISD::CTPOP, MVT::i8, Expand);
    setOperationAction(ISD::CTPOP, MVT::i16, Expand);
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
    if (Subtarget->is64Bit())
      setOperationAction(ISD::CTPOP, MVT::i64, Expand);
  }

  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);

  if (!Subtarget->hasMOVBE())
    setOperationAction(ISD::BSWAP, MVT::i16, Expand);

  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  // X86 wants to expand cmov itself.
  setOperationAction(ISD::SELECT, MVT::i8, Custom);
  setOperationAction(ISD::SELECT, MVT::i16, Custom);
  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);
  setOperationAction(ISD::SELECT, MVT::f80, Custom);
  setOperationAction(ISD::SETCC, MVT::i8, Custom);
  setOperationAction(ISD::SETCC, MVT::i16, Custom);
  setOperationAction(ISD::SETCC, MVT::i32, Custom);
  setOperationAction(ISD::SETCC, MVT::f32, Custom);
  setOperationAction(ISD::SETCC, MVT::f64, Custom);
  setOperationAction(ISD::SETCC, MVT::f80, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SELECT, MVT::i64, Custom);
    setOperationAction(ISD::SETCC, MVT::i64, Custom);
  }
  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
  // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
  // SjLj exception handling but a light-weight setjmp/longjmp replacement to
  // support continuation, user-level threading, and so on. As a result, no
  // other SjLj exception interfaces are implemented, so please don't build
  // your own exception handling based on them.
  // LLVM/Clang supports zero-cost DWARF exception handling.
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);

  // Darwin ABI issue.
  setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
  setOperationAction(ISD::JumpTable, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::ExternalSymbol, MVT::i32, Custom);
  setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
    setOperationAction(ISD::JumpTable, MVT::i64, Custom);
    setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
    setOperationAction(ISD::ExternalSymbol, MVT::i64, Custom);
    setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
  }
  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
  }

  if (Subtarget->hasSSE1())
    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);

  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);

  // Expand certain atomics
  for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
    MVT VT = IntVTs[i];
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
  }

  if (Subtarget->hasCmpxchg16b()) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
  }

  // FIXME - use subtarget debug flags
  if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() &&
      !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  if (Subtarget->is64Bit()) {
    setExceptionPointerRegister(X86::RAX);
    setExceptionSelectorRegister(X86::RDX);
  } else {
    setExceptionPointerRegister(X86::EAX);
    setExceptionSelectorRegister(X86::EDX);
  }
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) {
    // TargetInfo::X86_64ABIBuiltinVaList
    setOperationAction(ISD::VAARG, MVT::Other, Custom);
    setOperationAction(ISD::VACOPY, MVT::Other, Custom);
  } else {
    // TargetInfo::CharPtrBuiltinVaList
    setOperationAction(ISD::VAARG, MVT::Other, Expand);
    setOperationAction(ISD::VACOPY, MVT::Other, Expand);
  }

  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(), Custom);

  if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, &X86::FR32RegClass);
    addRegisterClass(MVT::f64, &X86::FR64RegClass);

    // Use ANDPD to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f64, Custom);
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f64, Custom);
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    // Use ANDPD and ORPD to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // Lower this to FGETSIGNx86 plus an AND.
    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN, MVT::f64, Expand);
    setOperationAction(ISD::FCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0));  // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, &X86::FR32RegClass);
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    }
  } else if (!TM.Options.UseSoftFloat) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
    addRegisterClass(MVT::f32, &X86::RFP32RegClass);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);
    setOperationAction(ISD::UNDEF, MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FSIN, MVT::f32, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f32, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
    }
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }

  // We don't support FMA.
  setOperationAction(ISD::FMA, MVT::f64, Expand);
  setOperationAction(ISD::FMA, MVT::f32, Expand);
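  // Note that scalar and vector FMA are re-enabled as Legal further below
  // when the subtarget reports FMA or FMA4 support.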
  // Long double always uses X87.
  if (!TM.Options.UseSoftFloat) {
    addRegisterClass(MVT::f80, &X86::RFP80RegClass);
    setOperationAction(ISD::UNDEF, MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
      addLegalFPImmediate(TmpFlt);  // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt);  // FLD0/FCHS

      bool ignored;
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2);  // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
    }

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f80, Expand);
      setOperationAction(ISD::FCOS, MVT::f80, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
    }

    setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
    setOperationAction(ISD::FCEIL, MVT::f80, Expand);
    setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
    setOperationAction(ISD::FRINT, MVT::f80, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
    setOperationAction(ISD::FMA, MVT::f80, Expand);
  }

  // Always use a library call for pow.
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f80, Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);

  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (int i = MVT::FIRST_VECTOR_VALUETYPE;
       i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
    MVT VT = (MVT::SimpleValueType)i;
    setOperationAction(ISD::ADD, VT, Expand);
    setOperationAction(ISD::SUB, VT, Expand);
    setOperationAction(ISD::FADD, VT, Expand);
    setOperationAction(ISD::FNEG, VT, Expand);
    setOperationAction(ISD::FSUB, VT, Expand);
    setOperationAction(ISD::MUL, VT, Expand);
    setOperationAction(ISD::FMUL, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::FDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::LOAD, VT, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::INSERT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::FABS, VT, Expand);
    setOperationAction(ISD::FSIN, VT, Expand);
    setOperationAction(ISD::FSINCOS, VT, Expand);
    setOperationAction(ISD::FCOS, VT, Expand);
    setOperationAction(ISD::FSINCOS, VT, Expand);
    setOperationAction(ISD::FREM, VT, Expand);
    setOperationAction(ISD::FMA, VT, Expand);
    setOperationAction(ISD::FPOWI, VT, Expand);
    setOperationAction(ISD::FSQRT, VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    setOperationAction(ISD::FFLOOR, VT, Expand);
    setOperationAction(ISD::FCEIL, VT, Expand);
    setOperationAction(ISD::FTRUNC, VT, Expand);
    setOperationAction(ISD::FRINT, VT, Expand);
    setOperationAction(ISD::FNEARBYINT, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
    setOperationAction(ISD::FPOW, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);
    setOperationAction(ISD::CTTZ, VT, Expand);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
    setOperationAction(ISD::CTLZ, VT, Expand);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
    setOperationAction(ISD::SHL, VT, Expand);
    setOperationAction(ISD::SRA, VT, Expand);
    setOperationAction(ISD::SRL, VT, Expand);
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
    setOperationAction(ISD::BSWAP, VT, Expand);
    setOperationAction(ISD::SETCC, VT, Expand);
    setOperationAction(ISD::FLOG, VT, Expand);
    setOperationAction(ISD::FLOG2, VT, Expand);
    setOperationAction(ISD::FLOG10, VT, Expand);
    setOperationAction(ISD::FEXP, VT, Expand);
    setOperationAction(ISD::FEXP2, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
    setOperationAction(ISD::TRUNCATE, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
    setOperationAction(ISD::ANY_EXTEND, VT, Expand);
    setOperationAction(ISD::VSELECT, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE;
         InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
      setTruncStoreAction(VT,
                          (MVT::SimpleValueType)InnerVT, Expand);
    setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
    setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);

    // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
    // types; we have to deal with them whether we ask for Expansion or not.
    // Setting Expand causes its own optimisation problems though, so leave
    // them legal.
    if (VT.getVectorElementType() == MVT::i1)
      setLoadExtAction(ISD::EXTLOAD, VT, Expand);
  }

  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
    addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
    // No operations on x86mmx supported, everything uses intrinsics.
  }

  // MMX-sized vectors (other than x86mmx) are expected to be expanded
  // into smaller operations.
  setOperationAction(ISD::MULHS, MVT::v8i8, Expand);
  setOperationAction(ISD::MULHS, MVT::v4i16, Expand);
  setOperationAction(ISD::MULHS, MVT::v2i32, Expand);
  setOperationAction(ISD::MULHS, MVT::v1i64, Expand);
  setOperationAction(ISD::AND, MVT::v8i8, Expand);
  setOperationAction(ISD::AND, MVT::v4i16, Expand);
  setOperationAction(ISD::AND, MVT::v2i32, Expand);
  setOperationAction(ISD::AND, MVT::v1i64, Expand);
  setOperationAction(ISD::OR, MVT::v8i8, Expand);
  setOperationAction(ISD::OR, MVT::v4i16, Expand);
  setOperationAction(ISD::OR, MVT::v2i32, Expand);
  setOperationAction(ISD::OR, MVT::v1i64, Expand);
  setOperationAction(ISD::XOR, MVT::v8i8, Expand);
  setOperationAction(ISD::XOR, MVT::v4i16, Expand);
  setOperationAction(ISD::XOR, MVT::v2i32, Expand);
  setOperationAction(ISD::XOR, MVT::v1i64, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i32, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Expand);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v1i64, Expand);
  setOperationAction(ISD::SELECT, MVT::v8i8, Expand);
  setOperationAction(ISD::SELECT, MVT::v4i16, Expand);
  setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
  setOperationAction(ISD::SELECT, MVT::v1i64, Expand);
  setOperationAction(ISD::BITCAST, MVT::v8i8, Expand);
  setOperationAction(ISD::BITCAST, MVT::v4i16, Expand);
  setOperationAction(ISD::BITCAST, MVT::v2i32, Expand);
  setOperationAction(ISD::BITCAST, MVT::v1i64, Expand);

  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
    addRegisterClass(MVT::v4f32, &X86::VR128RegClass);

    setOperationAction(ISD::FADD, MVT::v4f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
    setOperationAction(ISD::FABS, MVT::v4f32, Custom);
    setOperationAction(ISD::LOAD, MVT::v4f32, Legal);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
  }

  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
    addRegisterClass(MVT::v2f64, &X86::VR128RegClass);

    // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
    addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
    addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
    addRegisterClass(MVT::v2i64, &X86::VR128RegClass);

    setOperationAction(ISD::ADD, MVT::v16i8, Legal);
    setOperationAction(ISD::ADD, MVT::v8i16, Legal);
    setOperationAction(ISD::ADD, MVT::v4i32, Legal);
    setOperationAction(ISD::ADD, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
    setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
    setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
    setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
    setOperationAction(ISD::SUB, MVT::v16i8, Legal);
    setOperationAction(ISD::SUB, MVT::v8i16, Legal);
    setOperationAction(ISD::SUB, MVT::v4i32, Legal);
    setOperationAction(ISD::SUB, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
    setOperationAction(ISD::FADD, MVT::v2f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v2f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v2f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
    setOperationAction(ISD::FABS, MVT::v2f64, Custom);

    setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
    setOperationAction(ISD::SETCC, MVT::v16i8, Custom);
    setOperationAction(ISD::SETCC, MVT::v8i16, Custom);
    setOperationAction(ISD::SETCC, MVT::v4i32, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
      MVT VT = (MVT::SimpleValueType)i;
      // Do not attempt to custom lower non-power-of-2 vectors
      if (!isPowerOf2_32(VT.getVectorNumElements()))
        continue;
      // Do not attempt to custom lower non-128-bit vectors
      if (!VT.is128BitVector())
        continue;
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    // We support custom legalizing of sext and anyext loads for specific
    // memory vector types which we can load as a scalar (or sequence of
    // scalars) and extend in-register to a legal 128-bit vector type. For sext
    // loads these must work with a single scalar load.
    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i8, Custom);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, Custom);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v8i8, Custom);
    setLoadExtAction(ISD::EXTLOAD, MVT::v2i8, Custom);
    setLoadExtAction(ISD::EXTLOAD, MVT::v2i16, Custom);
    setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, Custom);
    setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Custom);
    setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, Custom);
    setLoadExtAction(ISD::EXTLOAD, MVT::v8i8, Custom);

    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    }

    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
    for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
      MVT VT = (MVT::SimpleValueType)i;

      // Do not attempt to promote non-128-bit vectors
      if (!VT.is128BitVector())
        continue;

      setOperationAction(ISD::AND, VT, Promote);
      AddPromotedToType(ISD::AND, VT, MVT::v2i64);
      setOperationAction(ISD::OR, VT, Promote);
      AddPromotedToType(ISD::OR, VT, MVT::v2i64);
      setOperationAction(ISD::XOR, VT, Promote);
      AddPromotedToType(ISD::XOR, VT, MVT::v2i64);
      setOperationAction(ISD::LOAD, VT, Promote);
      AddPromotedToType(ISD::LOAD, VT, MVT::v2i64);
      setOperationAction(ISD::SELECT, VT, Promote);
      AddPromotedToType(ISD::SELECT, VT, MVT::v2i64);
    }

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v2i64, Legal);
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);

    setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
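    // Note: v4i8/v4i16 values are always non-negative when widened to i32, so
    // one way to lower these UINT_TO_FP cases is to zero-extend to v4i32 and
    // reuse the signed v4i32 conversion marked Legal above.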
    // As there is no 64-bit GPR available, we need to build a special custom
    // sequence to convert from v2i32 to v2f32.
    if (!Subtarget->is64Bit())
      setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);

    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);

    setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, Legal);

    setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
    setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
    setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
  }

  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
    setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
    setOperationAction(ISD::FRINT, MVT::f32, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
    setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::f64, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
    setOperationAction(ISD::FRINT, MVT::f64, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);

    setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
    setOperationAction(ISD::FRINT, MVT::v4f32, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
    setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
    setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);
    setOperationAction(ISD::FRINT, MVT::v2f64, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal);

    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    setOperationAction(ISD::VSELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::VSELECT, MVT::v2i64, Custom);
    setOperationAction(ISD::VSELECT, MVT::v4i32, Custom);
    setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::VSELECT, MVT::v8i16, Custom);
    // There is no BLENDI for byte vectors. We don't need to custom lower
    // some vselects for now.
    setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);

    // SSE41 brings specific instructions for doing vector sign extend even in
    // cases where we don't have SRA.
    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i8, Custom);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i16, Custom);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, Custom);

    // i8 and i16 vectors are custom because the source register and source
    // memory operand types are not the same width. f32 vectors are
    // custom since the immediate controlling the insert encodes additional
    // information.
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);

    // FIXME: these should be Legal, but that's only for the case where
    // the index is constant. For now custom expand to deal with that.
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    }
  }

  if (Subtarget->hasSSE2()) {
    setOperationAction(ISD::SRL, MVT::v8i16, Custom);
    setOperationAction(ISD::SRL, MVT::v16i8, Custom);

    setOperationAction(ISD::SHL, MVT::v8i16, Custom);
    setOperationAction(ISD::SHL, MVT::v16i8, Custom);

    setOperationAction(ISD::SRA, MVT::v8i16, Custom);
    setOperationAction(ISD::SRA, MVT::v16i8, Custom);

    // In the customized shift lowering, the legal cases in AVX2 will be
    // recognized.
    setOperationAction(ISD::SRL, MVT::v2i64, Custom);
    setOperationAction(ISD::SRL, MVT::v4i32, Custom);

    setOperationAction(ISD::SHL, MVT::v2i64, Custom);
    setOperationAction(ISD::SHL, MVT::v4i32, Custom);

    setOperationAction(ISD::SRA, MVT::v4i32, Custom);
  }

  if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
    addRegisterClass(MVT::v32i8, &X86::VR256RegClass);
    addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
    addRegisterClass(MVT::v8i32, &X86::VR256RegClass);
    addRegisterClass(MVT::v8f32, &X86::VR256RegClass);
    addRegisterClass(MVT::v4i64, &X86::VR256RegClass);
    addRegisterClass(MVT::v4f64, &X86::VR256RegClass);

    setOperationAction(ISD::LOAD, MVT::v8f32, Legal);
    setOperationAction(ISD::LOAD, MVT::v4f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v4i64, Legal);

    setOperationAction(ISD::FADD, MVT::v8f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v8f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v8f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v8f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v8f32, Legal);
    setOperationAction(ISD::FFLOOR, MVT::v8f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::v8f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::v8f32, Legal);
    setOperationAction(ISD::FRINT, MVT::v8f32, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::v8f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v8f32, Custom);
    setOperationAction(ISD::FABS, MVT::v8f32, Custom);

    setOperationAction(ISD::FADD, MVT::v4f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
    setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::v4f64, Legal);
    setOperationAction(ISD::FTRUNC, MVT::v4f64, Legal);
    setOperationAction(ISD::FRINT, MVT::v4f64, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::v4f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f64, Custom);
    setOperationAction(ISD::FABS, MVT::v4f64, Custom);

    // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
    // even though v8i16 is a legal type.
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Promote);
    setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Promote);
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);

    setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
    setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);

    setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);

    setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, Legal);

    setOperationAction(ISD::SRL, MVT::v16i16, Custom);
    setOperationAction(ISD::SRL, MVT::v32i8, Custom);

    setOperationAction(ISD::SHL, MVT::v16i16, Custom);
    setOperationAction(ISD::SHL, MVT::v32i8, Custom);

    setOperationAction(ISD::SRA, MVT::v16i16, Custom);
    setOperationAction(ISD::SRA, MVT::v32i8, Custom);

    setOperationAction(ISD::SETCC, MVT::v32i8, Custom);
    setOperationAction(ISD::SETCC, MVT::v16i16, Custom);
    setOperationAction(ISD::SETCC, MVT::v8i32, Custom);
    setOperationAction(ISD::SETCC, MVT::v4i64, Custom);

    setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
    setOperationAction(ISD::SELECT, MVT::v8f32, Custom);

    setOperationAction(ISD::VSELECT, MVT::v4f64, Custom);
    setOperationAction(ISD::VSELECT, MVT::v4i64, Custom);
    setOperationAction(ISD::VSELECT, MVT::v8i32, Custom);
    setOperationAction(ISD::VSELECT, MVT::v8f32, Custom);

    setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v4i64, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v16i16, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);

    if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
      setOperationAction(ISD::FMA, MVT::v8f32, Legal);
      setOperationAction(ISD::FMA, MVT::v4f64, Legal);
      setOperationAction(ISD::FMA, MVT::v4f32, Legal);
      setOperationAction(ISD::FMA, MVT::v2f64, Legal);
      setOperationAction(ISD::FMA, MVT::f32, Legal);
      setOperationAction(ISD::FMA, MVT::f64, Legal);
    }

    if (Subtarget->hasInt256()) {
      setOperationAction(ISD::ADD, MVT::v4i64, Legal);
      setOperationAction(ISD::ADD, MVT::v8i32, Legal);
      setOperationAction(ISD::ADD, MVT::v16i16, Legal);
      setOperationAction(ISD::ADD, MVT::v32i8, Legal);

      setOperationAction(ISD::SUB, MVT::v4i64, Legal);
      setOperationAction(ISD::SUB, MVT::v8i32, Legal);
      setOperationAction(ISD::SUB, MVT::v16i16, Legal);
      setOperationAction(ISD::SUB, MVT::v32i8, Legal);

      setOperationAction(ISD::MUL, MVT::v4i64, Custom);
      setOperationAction(ISD::MUL, MVT::v8i32, Legal);
      setOperationAction(ISD::MUL, MVT::v16i16, Legal);
      // Don't lower v32i8 because there is no 128-bit byte mul
Custom); 01287 setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom); 01288 setOperationAction(ISD::MULHU, MVT::v16i16, Legal); 01289 setOperationAction(ISD::MULHS, MVT::v16i16, Legal); 01290 01291 setOperationAction(ISD::VSELECT, MVT::v16i16, Custom); 01292 setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); 01293 } else { 01294 setOperationAction(ISD::ADD, MVT::v4i64, Custom); 01295 setOperationAction(ISD::ADD, MVT::v8i32, Custom); 01296 setOperationAction(ISD::ADD, MVT::v16i16, Custom); 01297 setOperationAction(ISD::ADD, MVT::v32i8, Custom); 01298 01299 setOperationAction(ISD::SUB, MVT::v4i64, Custom); 01300 setOperationAction(ISD::SUB, MVT::v8i32, Custom); 01301 setOperationAction(ISD::SUB, MVT::v16i16, Custom); 01302 setOperationAction(ISD::SUB, MVT::v32i8, Custom); 01303 01304 setOperationAction(ISD::MUL, MVT::v4i64, Custom); 01305 setOperationAction(ISD::MUL, MVT::v8i32, Custom); 01306 setOperationAction(ISD::MUL, MVT::v16i16, Custom); 01307 // Don't lower v32i8 because there is no 128-bit byte mul 01308 } 01309 01310 // In the customized shift lowering, the legal cases in AVX2 will be 01311 // recognized. 01312 setOperationAction(ISD::SRL, MVT::v4i64, Custom); 01313 setOperationAction(ISD::SRL, MVT::v8i32, Custom); 01314 01315 setOperationAction(ISD::SHL, MVT::v4i64, Custom); 01316 setOperationAction(ISD::SHL, MVT::v8i32, Custom); 01317 01318 setOperationAction(ISD::SRA, MVT::v8i32, Custom); 01319 01320 // Custom lower several nodes for 256-bit types. 01321 for (int i = MVT::FIRST_VECTOR_VALUETYPE; 01322 i <= MVT::LAST_VECTOR_VALUETYPE; ++i) { 01323 MVT VT = (MVT::SimpleValueType)i; 01324 01325 // Extract subvector is special because the value type 01326 // (result) is 128-bit but the source is 256-bit wide. 01327 if (VT.is128BitVector()) 01328 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); 01329 01330 // Do not attempt to custom lower other non-256-bit vectors 01331 if (!VT.is256BitVector()) 01332 continue; 01333 01334 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 01335 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 01336 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); 01337 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 01338 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); 01339 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); 01340 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); 01341 } 01342 01343 // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64. 
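// A rough illustration (not from this file) of what that promotion means for
// one of these operations: an AND of two v8i32 values, A and B, is carried
// out in v4i64 and bitcast back,
//   SDValue A64 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i64, A);
//   SDValue B64 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i64, B);
//   SDValue R   = DAG.getNode(ISD::BITCAST, dl, MVT::v8i32,
//                             DAG.getNode(ISD::AND, dl, MVT::v4i64, A64, B64));
// so one full-width 256-bit logical instruction covers every element type.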
01344 for (int i = MVT::v32i8; i != MVT::v4i64; ++i) { 01345 MVT VT = (MVT::SimpleValueType)i; 01346 01347 // Do not attempt to promote non-256-bit vectors 01348 if (!VT.is256BitVector()) 01349 continue; 01350 01351 setOperationAction(ISD::AND, VT, Promote); 01352 AddPromotedToType (ISD::AND, VT, MVT::v4i64); 01353 setOperationAction(ISD::OR, VT, Promote); 01354 AddPromotedToType (ISD::OR, VT, MVT::v4i64); 01355 setOperationAction(ISD::XOR, VT, Promote); 01356 AddPromotedToType (ISD::XOR, VT, MVT::v4i64); 01357 setOperationAction(ISD::LOAD, VT, Promote); 01358 AddPromotedToType (ISD::LOAD, VT, MVT::v4i64); 01359 setOperationAction(ISD::SELECT, VT, Promote); 01360 AddPromotedToType (ISD::SELECT, VT, MVT::v4i64); 01361 } 01362 } 01363 01364 if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) { 01365 addRegisterClass(MVT::v16i32, &X86::VR512RegClass); 01366 addRegisterClass(MVT::v16f32, &X86::VR512RegClass); 01367 addRegisterClass(MVT::v8i64, &X86::VR512RegClass); 01368 addRegisterClass(MVT::v8f64, &X86::VR512RegClass); 01369 01370 addRegisterClass(MVT::i1, &X86::VK1RegClass); 01371 addRegisterClass(MVT::v8i1, &X86::VK8RegClass); 01372 addRegisterClass(MVT::v16i1, &X86::VK16RegClass); 01373 01374 setOperationAction(ISD::BR_CC, MVT::i1, Expand); 01375 setOperationAction(ISD::SETCC, MVT::i1, Custom); 01376 setOperationAction(ISD::XOR, MVT::i1, Legal); 01377 setOperationAction(ISD::OR, MVT::i1, Legal); 01378 setOperationAction(ISD::AND, MVT::i1, Legal); 01379 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, Legal); 01380 setOperationAction(ISD::LOAD, MVT::v16f32, Legal); 01381 setOperationAction(ISD::LOAD, MVT::v8f64, Legal); 01382 setOperationAction(ISD::LOAD, MVT::v8i64, Legal); 01383 setOperationAction(ISD::LOAD, MVT::v16i32, Legal); 01384 setOperationAction(ISD::LOAD, MVT::v16i1, Legal); 01385 01386 setOperationAction(ISD::FADD, MVT::v16f32, Legal); 01387 setOperationAction(ISD::FSUB, MVT::v16f32, Legal); 01388 setOperationAction(ISD::FMUL, MVT::v16f32, Legal); 01389 setOperationAction(ISD::FDIV, MVT::v16f32, Legal); 01390 setOperationAction(ISD::FSQRT, MVT::v16f32, Legal); 01391 setOperationAction(ISD::FNEG, MVT::v16f32, Custom); 01392 01393 setOperationAction(ISD::FADD, MVT::v8f64, Legal); 01394 setOperationAction(ISD::FSUB, MVT::v8f64, Legal); 01395 setOperationAction(ISD::FMUL, MVT::v8f64, Legal); 01396 setOperationAction(ISD::FDIV, MVT::v8f64, Legal); 01397 setOperationAction(ISD::FSQRT, MVT::v8f64, Legal); 01398 setOperationAction(ISD::FNEG, MVT::v8f64, Custom); 01399 setOperationAction(ISD::FMA, MVT::v8f64, Legal); 01400 setOperationAction(ISD::FMA, MVT::v16f32, Legal); 01401 01402 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal); 01403 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal); 01404 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal); 01405 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal); 01406 if (Subtarget->is64Bit()) { 01407 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Legal); 01408 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Legal); 01409 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Legal); 01410 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Legal); 01411 } 01412 setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal); 01413 setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal); 01414 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal); 01415 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); 01416 setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal); 01417 setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal); 01418 
setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal); 01419 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); 01420 setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal); 01421 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal); 01422 01423 setOperationAction(ISD::TRUNCATE, MVT::i1, Custom); 01424 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); 01425 setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); 01426 setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom); 01427 setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom); 01428 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom); 01429 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); 01430 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); 01431 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); 01432 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); 01433 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom); 01434 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom); 01435 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom); 01436 01437 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom); 01438 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom); 01439 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom); 01440 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom); 01441 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom); 01442 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Legal); 01443 01444 setOperationAction(ISD::SETCC, MVT::v16i1, Custom); 01445 setOperationAction(ISD::SETCC, MVT::v8i1, Custom); 01446 01447 setOperationAction(ISD::MUL, MVT::v8i64, Custom); 01448 01449 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom); 01450 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom); 01451 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i1, Custom); 01452 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i1, Custom); 01453 setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom); 01454 setOperationAction(ISD::BUILD_VECTOR, MVT::v16i1, Custom); 01455 setOperationAction(ISD::SELECT, MVT::v8f64, Custom); 01456 setOperationAction(ISD::SELECT, MVT::v8i64, Custom); 01457 setOperationAction(ISD::SELECT, MVT::v16f32, Custom); 01458 01459 setOperationAction(ISD::ADD, MVT::v8i64, Legal); 01460 setOperationAction(ISD::ADD, MVT::v16i32, Legal); 01461 01462 setOperationAction(ISD::SUB, MVT::v8i64, Legal); 01463 setOperationAction(ISD::SUB, MVT::v16i32, Legal); 01464 01465 setOperationAction(ISD::MUL, MVT::v16i32, Legal); 01466 01467 setOperationAction(ISD::SRL, MVT::v8i64, Custom); 01468 setOperationAction(ISD::SRL, MVT::v16i32, Custom); 01469 01470 setOperationAction(ISD::SHL, MVT::v8i64, Custom); 01471 setOperationAction(ISD::SHL, MVT::v16i32, Custom); 01472 01473 setOperationAction(ISD::SRA, MVT::v8i64, Custom); 01474 setOperationAction(ISD::SRA, MVT::v16i32, Custom); 01475 01476 setOperationAction(ISD::AND, MVT::v8i64, Legal); 01477 setOperationAction(ISD::OR, MVT::v8i64, Legal); 01478 setOperationAction(ISD::XOR, MVT::v8i64, Legal); 01479 setOperationAction(ISD::AND, MVT::v16i32, Legal); 01480 setOperationAction(ISD::OR, MVT::v16i32, Legal); 01481 setOperationAction(ISD::XOR, MVT::v16i32, Legal); 01482 01483 if (Subtarget->hasCDI()) { 01484 setOperationAction(ISD::CTLZ, MVT::v8i64, Legal); 01485 setOperationAction(ISD::CTLZ, MVT::v16i32, Legal); 01486 } 01487 01488 // Custom lower several nodes. 
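// Schematic reminder (handler name and signature assumed for illustration,
// not taken from this file): an operation marked Custom in the loop below is
// handed back to the target during legalization, e.g.
//   case ISD::EXTRACT_SUBVECTOR:   // in X86TargetLowering::LowerOperation
//     return LowerEXTRACT_SUBVECTOR(Op, Subtarget, DAG);
// while operations left Legal are matched directly by instruction selection.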
01489 for (int i = MVT::FIRST_VECTOR_VALUETYPE; 01490 i <= MVT::LAST_VECTOR_VALUETYPE; ++i) { 01491 MVT VT = (MVT::SimpleValueType)i; 01492 01493 unsigned EltSize = VT.getVectorElementType().getSizeInBits(); 01494 // Extract subvector is special because the value type 01495 // (result) is 256/128-bit but the source is 512-bit wide. 01496 if (VT.is128BitVector() || VT.is256BitVector()) 01497 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); 01498 01499 if (VT.getVectorElementType() == MVT::i1) 01500 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); 01501 01502 // Do not attempt to custom lower other non-512-bit vectors 01503 if (!VT.is512BitVector()) 01504 continue; 01505 01506 if ( EltSize >= 32) { 01507 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 01508 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); 01509 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 01510 setOperationAction(ISD::VSELECT, VT, Legal); 01511 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 01512 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); 01513 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); 01514 } 01515 } 01516 for (int i = MVT::v32i8; i != MVT::v8i64; ++i) { 01517 MVT VT = (MVT::SimpleValueType)i; 01518 01519 // Do not attempt to promote non-256-bit vectors 01520 if (!VT.is512BitVector()) 01521 continue; 01522 01523 setOperationAction(ISD::SELECT, VT, Promote); 01524 AddPromotedToType (ISD::SELECT, VT, MVT::v8i64); 01525 } 01526 }// has AVX-512 01527 01528 if (!TM.Options.UseSoftFloat && Subtarget->hasBWI()) { 01529 addRegisterClass(MVT::v32i16, &X86::VR512RegClass); 01530 addRegisterClass(MVT::v64i8, &X86::VR512RegClass); 01531 01532 addRegisterClass(MVT::v32i1, &X86::VK32RegClass); 01533 addRegisterClass(MVT::v64i1, &X86::VK64RegClass); 01534 01535 setOperationAction(ISD::LOAD, MVT::v32i16, Legal); 01536 setOperationAction(ISD::LOAD, MVT::v64i8, Legal); 01537 setOperationAction(ISD::SETCC, MVT::v32i1, Custom); 01538 setOperationAction(ISD::SETCC, MVT::v64i1, Custom); 01539 01540 for (int i = MVT::v32i8; i != MVT::v8i64; ++i) { 01541 const MVT VT = (MVT::SimpleValueType)i; 01542 01543 const unsigned EltSize = VT.getVectorElementType().getSizeInBits(); 01544 01545 // Do not attempt to promote non-256-bit vectors 01546 if (!VT.is512BitVector()) 01547 continue; 01548 01549 if ( EltSize < 32) { 01550 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 01551 setOperationAction(ISD::VSELECT, VT, Legal); 01552 } 01553 } 01554 } 01555 01556 if (!TM.Options.UseSoftFloat && Subtarget->hasVLX()) { 01557 addRegisterClass(MVT::v4i1, &X86::VK4RegClass); 01558 addRegisterClass(MVT::v2i1, &X86::VK2RegClass); 01559 01560 setOperationAction(ISD::SETCC, MVT::v4i1, Custom); 01561 setOperationAction(ISD::SETCC, MVT::v2i1, Custom); 01562 } 01563 01564 // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion 01565 // of this type with custom code. 01566 for (int VT = MVT::FIRST_VECTOR_VALUETYPE; 01567 VT != MVT::LAST_VECTOR_VALUETYPE; VT++) { 01568 setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, 01569 Custom); 01570 } 01571 01572 // We want to custom lower some of our intrinsics. 
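// Rough sketch (not from this file) of where these nodes end up: the custom
// hook reads the intrinsic ID carried as a constant operand and rewrites many
// of the arithmetic intrinsics into generic DAG nodes, e.g.
//   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
//   // ... then map IntNo to an ISD opcode via the intrinsic lookup table ...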
01573   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
01574   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
01575   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
01576   if (!Subtarget->is64Bit())
01577     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
01578 
01579   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
01580   // handle type legalization for these operations here.
01581   //
01582   // FIXME: We really should do custom legalization for addition and
01583   // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
01584   // than generic legalization for 64-bit multiplication-with-overflow, though.
01585   for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
01586     // Add/Sub/Mul with overflow operations are custom lowered.
01587     MVT VT = IntVTs[i];
01588     setOperationAction(ISD::SADDO, VT, Custom);
01589     setOperationAction(ISD::UADDO, VT, Custom);
01590     setOperationAction(ISD::SSUBO, VT, Custom);
01591     setOperationAction(ISD::USUBO, VT, Custom);
01592     setOperationAction(ISD::SMULO, VT, Custom);
01593     setOperationAction(ISD::UMULO, VT, Custom);
01594   }
01595 
01596   // There are no 8-bit 3-address imul/mul instructions
01597   setOperationAction(ISD::SMULO, MVT::i8, Expand);
01598   setOperationAction(ISD::UMULO, MVT::i8, Expand);
01599 
01600   if (!Subtarget->is64Bit()) {
01601     // These libcalls are not available in 32-bit.
01602     setLibcallName(RTLIB::SHL_I128, nullptr);
01603     setLibcallName(RTLIB::SRL_I128, nullptr);
01604     setLibcallName(RTLIB::SRA_I128, nullptr);
01605   }
01606 
01607   // Combine sin / cos into one node or libcall if possible.
01608   if (Subtarget->hasSinCos()) {
01609     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
01610     setLibcallName(RTLIB::SINCOS_F64, "sincos");
01611     if (Subtarget->isTargetDarwin()) {
01612       // For MacOSX, we don't want the normal expansion of a libcall to
01613       // sincos. We want to issue a libcall to __sincos_stret to avoid memory
01614       // traffic.
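// A minimal sketch of the callee this targets (prototype and field names
// assumed here purely for illustration):
//   struct __sincos_ret { double sinval; double cosval; };
//   extern "C" __sincos_ret __sincos_stret(double x);  // both results come
//                                                      // back in registers
// so a single call produces sin(x) and cos(x) with no pointer out-parameters.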
01615 setOperationAction(ISD::FSINCOS, MVT::f64, Custom); 01616 setOperationAction(ISD::FSINCOS, MVT::f32, Custom); 01617 } 01618 } 01619 01620 if (Subtarget->isTargetWin64()) { 01621 setOperationAction(ISD::SDIV, MVT::i128, Custom); 01622 setOperationAction(ISD::UDIV, MVT::i128, Custom); 01623 setOperationAction(ISD::SREM, MVT::i128, Custom); 01624 setOperationAction(ISD::UREM, MVT::i128, Custom); 01625 setOperationAction(ISD::SDIVREM, MVT::i128, Custom); 01626 setOperationAction(ISD::UDIVREM, MVT::i128, Custom); 01627 } 01628 01629 // We have target-specific dag combine patterns for the following nodes: 01630 setTargetDAGCombine(ISD::VECTOR_SHUFFLE); 01631 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); 01632 setTargetDAGCombine(ISD::VSELECT); 01633 setTargetDAGCombine(ISD::SELECT); 01634 setTargetDAGCombine(ISD::SHL); 01635 setTargetDAGCombine(ISD::SRA); 01636 setTargetDAGCombine(ISD::SRL); 01637 setTargetDAGCombine(ISD::OR); 01638 setTargetDAGCombine(ISD::AND); 01639 setTargetDAGCombine(ISD::ADD); 01640 setTargetDAGCombine(ISD::FADD); 01641 setTargetDAGCombine(ISD::FSUB); 01642 setTargetDAGCombine(ISD::FMA); 01643 setTargetDAGCombine(ISD::SUB); 01644 setTargetDAGCombine(ISD::LOAD); 01645 setTargetDAGCombine(ISD::STORE); 01646 setTargetDAGCombine(ISD::ZERO_EXTEND); 01647 setTargetDAGCombine(ISD::ANY_EXTEND); 01648 setTargetDAGCombine(ISD::SIGN_EXTEND); 01649 setTargetDAGCombine(ISD::SIGN_EXTEND_INREG); 01650 setTargetDAGCombine(ISD::TRUNCATE); 01651 setTargetDAGCombine(ISD::SINT_TO_FP); 01652 setTargetDAGCombine(ISD::SETCC); 01653 setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); 01654 setTargetDAGCombine(ISD::BUILD_VECTOR); 01655 if (Subtarget->is64Bit()) 01656 setTargetDAGCombine(ISD::MUL); 01657 setTargetDAGCombine(ISD::XOR); 01658 01659 computeRegisterProperties(); 01660 01661 // On Darwin, -Os means optimize for size without hurting performance, 01662 // do not reduce the limit. 01663 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores 01664 MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8; 01665 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores 01666 MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4; 01667 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores 01668 MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4; 01669 setPrefLoopAlignment(4); // 2^4 bytes. 01670 01671 // Predictable cmov don't hurt on atom because it's in-order. 01672 PredictableSelectIsExpensive = !Subtarget->isAtom(); 01673 01674 setPrefFunctionAlignment(4); // 2^4 bytes. 01675 01676 verifyIntrinsicTables(); 01677 } 01678 01679 // This has so far only been implemented for 64-bit MachO. 01680 bool X86TargetLowering::useLoadStackGuardNode() const { 01681 return Subtarget->getTargetTriple().getObjectFormat() == Triple::MachO && 01682 Subtarget->is64Bit(); 01683 } 01684 01685 TargetLoweringBase::LegalizeTypeAction 01686 X86TargetLowering::getPreferredVectorAction(EVT VT) const { 01687 if (ExperimentalVectorWideningLegalization && 01688 VT.getVectorNumElements() != 1 && 01689 VT.getVectorElementType().getSimpleVT() != MVT::i1) 01690 return TypeWidenVector; 01691 01692 return TargetLoweringBase::getPreferredVectorAction(VT); 01693 } 01694 01695 EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { 01696 if (!VT.isVector()) 01697 return Subtarget->hasAVX512() ? 
MVT::i1: MVT::i8; 01698 01699 const unsigned NumElts = VT.getVectorNumElements(); 01700 const EVT EltVT = VT.getVectorElementType(); 01701 if (VT.is512BitVector()) { 01702 if (Subtarget->hasAVX512()) 01703 if (EltVT == MVT::i32 || EltVT == MVT::i64 || 01704 EltVT == MVT::f32 || EltVT == MVT::f64) 01705 switch(NumElts) { 01706 case 8: return MVT::v8i1; 01707 case 16: return MVT::v16i1; 01708 } 01709 if (Subtarget->hasBWI()) 01710 if (EltVT == MVT::i8 || EltVT == MVT::i16) 01711 switch(NumElts) { 01712 case 32: return MVT::v32i1; 01713 case 64: return MVT::v64i1; 01714 } 01715 } 01716 01717 if (VT.is256BitVector() || VT.is128BitVector()) { 01718 if (Subtarget->hasVLX()) 01719 if (EltVT == MVT::i32 || EltVT == MVT::i64 || 01720 EltVT == MVT::f32 || EltVT == MVT::f64) 01721 switch(NumElts) { 01722 case 2: return MVT::v2i1; 01723 case 4: return MVT::v4i1; 01724 case 8: return MVT::v8i1; 01725 } 01726 if (Subtarget->hasBWI() && Subtarget->hasVLX()) 01727 if (EltVT == MVT::i8 || EltVT == MVT::i16) 01728 switch(NumElts) { 01729 case 8: return MVT::v8i1; 01730 case 16: return MVT::v16i1; 01731 case 32: return MVT::v32i1; 01732 } 01733 } 01734 01735 return VT.changeVectorElementTypeToInteger(); 01736 } 01737 01738 /// getMaxByValAlign - Helper for getByValTypeAlignment to determine 01739 /// the desired ByVal argument alignment. 01740 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) { 01741 if (MaxAlign == 16) 01742 return; 01743 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) { 01744 if (VTy->getBitWidth() == 128) 01745 MaxAlign = 16; 01746 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { 01747 unsigned EltAlign = 0; 01748 getMaxByValAlign(ATy->getElementType(), EltAlign); 01749 if (EltAlign > MaxAlign) 01750 MaxAlign = EltAlign; 01751 } else if (StructType *STy = dyn_cast<StructType>(Ty)) { 01752 for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { 01753 unsigned EltAlign = 0; 01754 getMaxByValAlign(STy->getElementType(i), EltAlign); 01755 if (EltAlign > MaxAlign) 01756 MaxAlign = EltAlign; 01757 if (MaxAlign == 16) 01758 break; 01759 } 01760 } 01761 } 01762 01763 /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate 01764 /// function arguments in the caller parameter area. For X86, aggregates 01765 /// that contain SSE vectors are placed at 16-byte boundaries while the rest 01766 /// are at 4-byte boundaries. 01767 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const { 01768 if (Subtarget->is64Bit()) { 01769 // Max of 8 and alignment of type. 01770 unsigned TyAlign = TD->getABITypeAlignment(Ty); 01771 if (TyAlign > 8) 01772 return TyAlign; 01773 return 8; 01774 } 01775 01776 unsigned Align = 4; 01777 if (Subtarget->hasSSE1()) 01778 getMaxByValAlign(Ty, Align); 01779 return Align; 01780 } 01781 01782 /// getOptimalMemOpType - Returns the target specific optimal type for load 01783 /// and store operations as a result of memset, memcpy, and memmove 01784 /// lowering. If DstAlign is zero that means it's safe to destination 01785 /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it 01786 /// means there isn't a need to check it against alignment requirement, 01787 /// probably because the source does not need to be loaded. If 'IsMemset' is 01788 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that 01789 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy 01790 /// source is constant so it does not need to be loaded. 
01791 /// It returns EVT::Other if the type should be determined using generic 01792 /// target-independent logic. 01793 EVT 01794 X86TargetLowering::getOptimalMemOpType(uint64_t Size, 01795 unsigned DstAlign, unsigned SrcAlign, 01796 bool IsMemset, bool ZeroMemset, 01797 bool MemcpyStrSrc, 01798 MachineFunction &MF) const { 01799 const Function *F = MF.getFunction(); 01800 if ((!IsMemset || ZeroMemset) && 01801 !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, 01802 Attribute::NoImplicitFloat)) { 01803 if (Size >= 16 && 01804 (Subtarget->isUnalignedMemAccessFast() || 01805 ((DstAlign == 0 || DstAlign >= 16) && 01806 (SrcAlign == 0 || SrcAlign >= 16)))) { 01807 if (Size >= 32) { 01808 if (Subtarget->hasInt256()) 01809 return MVT::v8i32; 01810 if (Subtarget->hasFp256()) 01811 return MVT::v8f32; 01812 } 01813 if (Subtarget->hasSSE2()) 01814 return MVT::v4i32; 01815 if (Subtarget->hasSSE1()) 01816 return MVT::v4f32; 01817 } else if (!MemcpyStrSrc && Size >= 8 && 01818 !Subtarget->is64Bit() && 01819 Subtarget->hasSSE2()) { 01820 // Do not use f64 to lower memcpy if source is string constant. It's 01821 // better to use i32 to avoid the loads. 01822 return MVT::f64; 01823 } 01824 } 01825 if (Subtarget->is64Bit() && Size >= 8) 01826 return MVT::i64; 01827 return MVT::i32; 01828 } 01829 01830 bool X86TargetLowering::isSafeMemOpType(MVT VT) const { 01831 if (VT == MVT::f32) 01832 return X86ScalarSSEf32; 01833 else if (VT == MVT::f64) 01834 return X86ScalarSSEf64; 01835 return true; 01836 } 01837 01838 bool 01839 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, 01840 unsigned, 01841 unsigned, 01842 bool *Fast) const { 01843 if (Fast) 01844 *Fast = Subtarget->isUnalignedMemAccessFast(); 01845 return true; 01846 } 01847 01848 /// getJumpTableEncoding - Return the entry encoding for a jump table in the 01849 /// current function. The returned value is a member of the 01850 /// MachineJumpTableInfo::JTEntryKind enum. 01851 unsigned X86TargetLowering::getJumpTableEncoding() const { 01852 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF 01853 // symbol. 01854 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 01855 Subtarget->isPICStyleGOT()) 01856 return MachineJumpTableInfo::EK_Custom32; 01857 01858 // Otherwise, use the normal jump table encoding heuristics. 01859 return TargetLowering::getJumpTableEncoding(); 01860 } 01861 01862 const MCExpr * 01863 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, 01864 const MachineBasicBlock *MBB, 01865 unsigned uid,MCContext &Ctx) const{ 01866 assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ && 01867 Subtarget->isPICStyleGOT()); 01868 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF 01869 // entries. 01870 return MCSymbolRefExpr::Create(MBB->getSymbol(), 01871 MCSymbolRefExpr::VK_GOTOFF, Ctx); 01872 } 01873 01874 /// getPICJumpTableRelocaBase - Returns relocation base for the given PIC 01875 /// jumptable. 01876 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table, 01877 SelectionDAG &DAG) const { 01878 if (!Subtarget->is64Bit()) 01879 // This doesn't have SDLoc associated with it, but is not really the 01880 // same as a Register. 01881 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy()); 01882 return Table; 01883 } 01884 01885 /// getPICJumpTableRelocBaseExpr - This returns the relocation base for the 01886 /// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an 01887 /// MCExpr. 
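// Illustrative contrast of the two cases handled below (schematic only):
//  - x86-64 RIP-relative PIC: entries are 32-bit differences against the jump
//    table's own label, so the default base from the superclass is correct.
//  - 32-bit GOT PIC: entries are emitted @GOTOFF (see the custom entry code
//    above), so the matching base is the function's PIC base symbol.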
01888 const MCExpr *X86TargetLowering:: 01889 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, 01890 MCContext &Ctx) const { 01891 // X86-64 uses RIP relative addressing based on the jump table label. 01892 if (Subtarget->isPICStyleRIPRel()) 01893 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); 01894 01895 // Otherwise, the reference is relative to the PIC base. 01896 return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx); 01897 } 01898 01899 // FIXME: Why this routine is here? Move to RegInfo! 01900 std::pair<const TargetRegisterClass*, uint8_t> 01901 X86TargetLowering::findRepresentativeClass(MVT VT) const{ 01902 const TargetRegisterClass *RRC = nullptr; 01903 uint8_t Cost = 1; 01904 switch (VT.SimpleTy) { 01905 default: 01906 return TargetLowering::findRepresentativeClass(VT); 01907 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64: 01908 RRC = Subtarget->is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass; 01909 break; 01910 case MVT::x86mmx: 01911 RRC = &X86::VR64RegClass; 01912 break; 01913 case MVT::f32: case MVT::f64: 01914 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: 01915 case MVT::v4f32: case MVT::v2f64: 01916 case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32: 01917 case MVT::v4f64: 01918 RRC = &X86::VR128RegClass; 01919 break; 01920 } 01921 return std::make_pair(RRC, Cost); 01922 } 01923 01924 bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace, 01925 unsigned &Offset) const { 01926 if (!Subtarget->isTargetLinux()) 01927 return false; 01928 01929 if (Subtarget->is64Bit()) { 01930 // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs: 01931 Offset = 0x28; 01932 if (getTargetMachine().getCodeModel() == CodeModel::Kernel) 01933 AddressSpace = 256; 01934 else 01935 AddressSpace = 257; 01936 } else { 01937 // %gs:0x14 on i386 01938 Offset = 0x14; 01939 AddressSpace = 256; 01940 } 01941 return true; 01942 } 01943 01944 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, 01945 unsigned DestAS) const { 01946 assert(SrcAS != DestAS && "Expected different address spaces!"); 01947 01948 return SrcAS < 256 && DestAS < 256; 01949 } 01950 01951 //===----------------------------------------------------------------------===// 01952 // Return Value Calling Convention Implementation 01953 //===----------------------------------------------------------------------===// 01954 01955 #include "X86GenCallingConv.inc" 01956 01957 bool 01958 X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, 01959 MachineFunction &MF, bool isVarArg, 01960 const SmallVectorImpl<ISD::OutputArg> &Outs, 01961 LLVMContext &Context) const { 01962 SmallVector<CCValAssign, 16> RVLocs; 01963 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); 01964 return CCInfo.CheckReturn(Outs, RetCC_X86); 01965 } 01966 01967 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const { 01968 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 }; 01969 return ScratchRegs; 01970 } 01971 01972 SDValue 01973 X86TargetLowering::LowerReturn(SDValue Chain, 01974 CallingConv::ID CallConv, bool isVarArg, 01975 const SmallVectorImpl<ISD::OutputArg> &Outs, 01976 const SmallVectorImpl<SDValue> &OutVals, 01977 SDLoc dl, SelectionDAG &DAG) const { 01978 MachineFunction &MF = DAG.getMachineFunction(); 01979 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 01980 01981 SmallVector<CCValAssign, 16> RVLocs; 01982 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, 
*DAG.getContext()); 01983 CCInfo.AnalyzeReturn(Outs, RetCC_X86); 01984 01985 SDValue Flag; 01986 SmallVector<SDValue, 6> RetOps; 01987 RetOps.push_back(Chain); // Operand #0 = Chain (updated below) 01988 // Operand #1 = Bytes To Pop 01989 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), 01990 MVT::i16)); 01991 01992 // Copy the result values into the output registers. 01993 for (unsigned i = 0; i != RVLocs.size(); ++i) { 01994 CCValAssign &VA = RVLocs[i]; 01995 assert(VA.isRegLoc() && "Can only return in registers!"); 01996 SDValue ValToCopy = OutVals[i]; 01997 EVT ValVT = ValToCopy.getValueType(); 01998 01999 // Promote values to the appropriate types 02000 if (VA.getLocInfo() == CCValAssign::SExt) 02001 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy); 02002 else if (VA.getLocInfo() == CCValAssign::ZExt) 02003 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy); 02004 else if (VA.getLocInfo() == CCValAssign::AExt) 02005 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy); 02006 else if (VA.getLocInfo() == CCValAssign::BCvt) 02007 ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy); 02008 02009 assert(VA.getLocInfo() != CCValAssign::FPExt && 02010 "Unexpected FP-extend for return value."); 02011 02012 // If this is x86-64, and we disabled SSE, we can't return FP values, 02013 // or SSE or MMX vectors. 02014 if ((ValVT == MVT::f32 || ValVT == MVT::f64 || 02015 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) && 02016 (Subtarget->is64Bit() && !Subtarget->hasSSE1())) { 02017 report_fatal_error("SSE register return with SSE disabled"); 02018 } 02019 // Likewise we can't return F64 values with SSE1 only. gcc does so, but 02020 // llvm-gcc has never done it right and no one has noticed, so this 02021 // should be OK for now. 02022 if (ValVT == MVT::f64 && 02023 (Subtarget->is64Bit() && !Subtarget->hasSSE2())) 02024 report_fatal_error("SSE2 register return with SSE2 disabled"); 02025 02026 // Returns in ST0/ST1 are handled specially: these are pushed as operands to 02027 // the RET instruction and handled by the FP Stackifier. 02028 if (VA.getLocReg() == X86::FP0 || 02029 VA.getLocReg() == X86::FP1) { 02030 // If this is a copy from an xmm register to ST(0), use an FPExtend to 02031 // change the value to the FP stack register class. 02032 if (isScalarFPTypeInSSEReg(VA.getValVT())) 02033 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); 02034 RetOps.push_back(ValToCopy); 02035 // Don't emit a copytoreg. 02036 continue; 02037 } 02038 02039 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 02040 // which is returned in RAX / RDX. 02041 if (Subtarget->is64Bit()) { 02042 if (ValVT == MVT::x86mmx) { 02043 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { 02044 ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy); 02045 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, 02046 ValToCopy); 02047 // If we don't have SSE2 available, convert to v4f32 so the generated 02048 // register is legal. 
02049 if (!Subtarget->hasSSE2()) 02050 ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy); 02051 } 02052 } 02053 } 02054 02055 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag); 02056 Flag = Chain.getValue(1); 02057 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 02058 } 02059 02060 // The x86-64 ABIs require that for returning structs by value we copy 02061 // the sret argument into %rax/%eax (depending on ABI) for the return. 02062 // Win32 requires us to put the sret argument to %eax as well. 02063 // We saved the argument into a virtual register in the entry block, 02064 // so now we copy the value out and into %rax/%eax. 02065 if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() && 02066 (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) { 02067 MachineFunction &MF = DAG.getMachineFunction(); 02068 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 02069 unsigned Reg = FuncInfo->getSRetReturnReg(); 02070 assert(Reg && 02071 "SRetReturnReg should have been set in LowerFormalArguments()."); 02072 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy()); 02073 02074 unsigned RetValReg 02075 = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ? 02076 X86::RAX : X86::EAX; 02077 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag); 02078 Flag = Chain.getValue(1); 02079 02080 // RAX/EAX now acts like a return value. 02081 RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy())); 02082 } 02083 02084 RetOps[0] = Chain; // Update chain. 02085 02086 // Add the flag if we have it. 02087 if (Flag.getNode()) 02088 RetOps.push_back(Flag); 02089 02090 return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps); 02091 } 02092 02093 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { 02094 if (N->getNumValues() != 1) 02095 return false; 02096 if (!N->hasNUsesOfValue(1, 0)) 02097 return false; 02098 02099 SDValue TCChain = Chain; 02100 SDNode *Copy = *N->use_begin(); 02101 if (Copy->getOpcode() == ISD::CopyToReg) { 02102 // If the copy has a glue operand, we conservatively assume it isn't safe to 02103 // perform a tail call. 02104 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) 02105 return false; 02106 TCChain = Copy->getOperand(0); 02107 } else if (Copy->getOpcode() != ISD::FP_EXTEND) 02108 return false; 02109 02110 bool HasRet = false; 02111 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); 02112 UI != UE; ++UI) { 02113 if (UI->getOpcode() != X86ISD::RET_FLAG) 02114 return false; 02115 // If we are returning more than one value, we can definitely 02116 // not make a tail call see PR19530 02117 if (UI->getNumOperands() > 4) 02118 return false; 02119 if (UI->getNumOperands() == 4 && 02120 UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue) 02121 return false; 02122 HasRet = true; 02123 } 02124 02125 if (!HasRet) 02126 return false; 02127 02128 Chain = TCChain; 02129 return true; 02130 } 02131 02132 EVT 02133 X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT, 02134 ISD::NodeType ExtendKind) const { 02135 MVT ReturnMVT; 02136 // TODO: Is this also valid on 32-bit? 02137 if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND) 02138 ReturnMVT = MVT::i8; 02139 else 02140 ReturnMVT = MVT::i32; 02141 02142 EVT MinVT = getRegisterType(Context, ReturnMVT); 02143 return VT.bitsLT(MinVT) ? 
MinVT : VT; 02144 } 02145 02146 /// LowerCallResult - Lower the result values of a call into the 02147 /// appropriate copies out of appropriate physical registers. 02148 /// 02149 SDValue 02150 X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, 02151 CallingConv::ID CallConv, bool isVarArg, 02152 const SmallVectorImpl<ISD::InputArg> &Ins, 02153 SDLoc dl, SelectionDAG &DAG, 02154 SmallVectorImpl<SDValue> &InVals) const { 02155 02156 // Assign locations to each value returned by this call. 02157 SmallVector<CCValAssign, 16> RVLocs; 02158 bool Is64Bit = Subtarget->is64Bit(); 02159 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 02160 *DAG.getContext()); 02161 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 02162 02163 // Copy all of the result registers out of their specified physreg. 02164 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { 02165 CCValAssign &VA = RVLocs[i]; 02166 EVT CopyVT = VA.getValVT(); 02167 02168 // If this is x86-64, and we disabled SSE, we can't return FP values 02169 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && 02170 ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) { 02171 report_fatal_error("SSE register return with SSE disabled"); 02172 } 02173 02174 // If we prefer to use the value in xmm registers, copy it out as f80 and 02175 // use a truncate to move it from fp stack reg to xmm reg. 02176 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) && 02177 isScalarFPTypeInSSEReg(VA.getValVT())) 02178 CopyVT = MVT::f80; 02179 02180 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 02181 CopyVT, InFlag).getValue(1); 02182 SDValue Val = Chain.getValue(0); 02183 02184 if (CopyVT != VA.getValVT()) 02185 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, 02186 // This truncation won't change the value. 02187 DAG.getIntPtrConstant(1)); 02188 02189 InFlag = Chain.getValue(2); 02190 InVals.push_back(Val); 02191 } 02192 02193 return Chain; 02194 } 02195 02196 //===----------------------------------------------------------------------===// 02197 // C & StdCall & Fast Calling Convention implementation 02198 //===----------------------------------------------------------------------===// 02199 // StdCall calling convention seems to be standard for many Windows' API 02200 // routines and around. It differs from C calling convention just a little: 02201 // callee should clean up the stack, not caller. Symbols should be also 02202 // decorated in some fancy way :) It doesn't support any vector arguments. 02203 // For info on fast calling convention see Fast Calling Convention (tail call) 02204 // implementation LowerX86_32FastCCCallTo. 02205 02206 /// CallIsStructReturn - Determines whether a call uses struct return 02207 /// semantics. 02208 enum StructReturnType { 02209 NotStructReturn, 02210 RegStructReturn, 02211 StackStructReturn 02212 }; 02213 static StructReturnType 02214 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) { 02215 if (Outs.empty()) 02216 return NotStructReturn; 02217 02218 const ISD::ArgFlagsTy &Flags = Outs[0].Flags; 02219 if (!Flags.isSRet()) 02220 return NotStructReturn; 02221 if (Flags.isInReg()) 02222 return RegStructReturn; 02223 return StackStructReturn; 02224 } 02225 02226 /// ArgsAreStructReturn - Determines whether a function uses struct 02227 /// return semantics. 
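// Illustrative example (not from this file): for a C-level callee such as
//   struct Big { int v[8]; };
//   struct Big makeBig(void);
// the result is returned through a hidden sret pointer argument, and these
// helpers classify the call as StackStructReturn, or as RegStructReturn when
// that sret pointer is itself marked inreg.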
02228 static StructReturnType 02229 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) { 02230 if (Ins.empty()) 02231 return NotStructReturn; 02232 02233 const ISD::ArgFlagsTy &Flags = Ins[0].Flags; 02234 if (!Flags.isSRet()) 02235 return NotStructReturn; 02236 if (Flags.isInReg()) 02237 return RegStructReturn; 02238 return StackStructReturn; 02239 } 02240 02241 /// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified 02242 /// by "Src" to address "Dst" with size and alignment information specified by 02243 /// the specific parameter attribute. The copy will be passed as a byval 02244 /// function parameter. 02245 static SDValue 02246 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, 02247 ISD::ArgFlagsTy Flags, SelectionDAG &DAG, 02248 SDLoc dl) { 02249 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); 02250 02251 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), 02252 /*isVolatile*/false, /*AlwaysInline=*/true, 02253 MachinePointerInfo(), MachinePointerInfo()); 02254 } 02255 02256 /// IsTailCallConvention - Return true if the calling convention is one that 02257 /// supports tail call optimization. 02258 static bool IsTailCallConvention(CallingConv::ID CC) { 02259 return (CC == CallingConv::Fast || CC == CallingConv::GHC || 02260 CC == CallingConv::HiPE); 02261 } 02262 02263 /// \brief Return true if the calling convention is a C calling convention. 02264 static bool IsCCallConvention(CallingConv::ID CC) { 02265 return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 || 02266 CC == CallingConv::X86_64_SysV); 02267 } 02268 02269 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { 02270 if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls) 02271 return false; 02272 02273 CallSite CS(CI); 02274 CallingConv::ID CalleeCC = CS.getCallingConv(); 02275 if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC)) 02276 return false; 02277 02278 return true; 02279 } 02280 02281 /// FuncIsMadeTailCallSafe - Return true if the function is being made into 02282 /// a tailcall target by changing its ABI. 02283 static bool FuncIsMadeTailCallSafe(CallingConv::ID CC, 02284 bool GuaranteedTailCallOpt) { 02285 return GuaranteedTailCallOpt && IsTailCallConvention(CC); 02286 } 02287 02288 SDValue 02289 X86TargetLowering::LowerMemArgument(SDValue Chain, 02290 CallingConv::ID CallConv, 02291 const SmallVectorImpl<ISD::InputArg> &Ins, 02292 SDLoc dl, SelectionDAG &DAG, 02293 const CCValAssign &VA, 02294 MachineFrameInfo *MFI, 02295 unsigned i) const { 02296 // Create the nodes corresponding to a load from this parameter slot. 02297 ISD::ArgFlagsTy Flags = Ins[i].Flags; 02298 bool AlwaysUseMutable = FuncIsMadeTailCallSafe( 02299 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt); 02300 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); 02301 EVT ValVT; 02302 02303 // If value is passed by pointer we have address passed instead of the value 02304 // itself. 02305 if (VA.getLocInfo() == CCValAssign::Indirect) 02306 ValVT = VA.getLocVT(); 02307 else 02308 ValVT = VA.getValVT(); 02309 02310 // FIXME: For now, all byval parameter objects are marked mutable. This can be 02311 // changed with more analysis. 02312 // In case of tail call optimization mark all arguments mutable. Since they 02313 // could be overwritten by lowering of arguments in case of a tail call. 
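// For illustration (not from this file): a C aggregate passed by value, e.g.
//   struct S { int v[4]; };
//   void f(struct S s);
// arrives here as a byval argument on x86-32; the code below just hands back
// a fixed frame index over the caller-created copy instead of emitting a load.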
02314 if (Flags.isByVal()) { 02315 unsigned Bytes = Flags.getByValSize(); 02316 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects. 02317 int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable); 02318 return DAG.getFrameIndex(FI, getPointerTy()); 02319 } else { 02320 int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, 02321 VA.getLocMemOffset(), isImmutable); 02322 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); 02323 return DAG.getLoad(ValVT, dl, Chain, FIN, 02324 MachinePointerInfo::getFixedStack(FI), 02325 false, false, false, 0); 02326 } 02327 } 02328 02329 // FIXME: Get this from tablegen. 02330 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv, 02331 const X86Subtarget *Subtarget) { 02332 assert(Subtarget->is64Bit()); 02333 02334 if (Subtarget->isCallingConvWin64(CallConv)) { 02335 static const MCPhysReg GPR64ArgRegsWin64[] = { 02336 X86::RCX, X86::RDX, X86::R8, X86::R9 02337 }; 02338 return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64)); 02339 } 02340 02341 static const MCPhysReg GPR64ArgRegs64Bit[] = { 02342 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 02343 }; 02344 return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit)); 02345 } 02346 02347 // FIXME: Get this from tablegen. 02348 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF, 02349 CallingConv::ID CallConv, 02350 const X86Subtarget *Subtarget) { 02351 assert(Subtarget->is64Bit()); 02352 if (Subtarget->isCallingConvWin64(CallConv)) { 02353 // The XMM registers which might contain var arg parameters are shadowed 02354 // in their paired GPR. So we only need to save the GPR to their home 02355 // slots. 02356 // TODO: __vectorcall will change this. 02357 return None; 02358 } 02359 02360 const Function *Fn = MF.getFunction(); 02361 bool NoImplicitFloatOps = Fn->getAttributes(). 02362 hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat); 02363 assert(!(MF.getTarget().Options.UseSoftFloat && NoImplicitFloatOps) && 02364 "SSE register cannot be used when SSE is disabled!"); 02365 if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps || 02366 !Subtarget->hasSSE1()) 02367 // Kernel mode asks for SSE to be disabled, so there are no XMM argument 02368 // registers. 
02369 return None; 02370 02371 static const MCPhysReg XMMArgRegs64Bit[] = { 02372 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 02373 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 02374 }; 02375 return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit)); 02376 } 02377 02378 SDValue 02379 X86TargetLowering::LowerFormalArguments(SDValue Chain, 02380 CallingConv::ID CallConv, 02381 bool isVarArg, 02382 const SmallVectorImpl<ISD::InputArg> &Ins, 02383 SDLoc dl, 02384 SelectionDAG &DAG, 02385 SmallVectorImpl<SDValue> &InVals) 02386 const { 02387 MachineFunction &MF = DAG.getMachineFunction(); 02388 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 02389 02390 const Function* Fn = MF.getFunction(); 02391 if (Fn->hasExternalLinkage() && 02392 Subtarget->isTargetCygMing() && 02393 Fn->getName() == "main") 02394 FuncInfo->setForceFramePointer(true); 02395 02396 MachineFrameInfo *MFI = MF.getFrameInfo(); 02397 bool Is64Bit = Subtarget->is64Bit(); 02398 bool IsWin64 = Subtarget->isCallingConvWin64(CallConv); 02399 02400 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 02401 "Var args not supported with calling convention fastcc, ghc or hipe"); 02402 02403 // Assign locations to all of the incoming arguments. 02404 SmallVector<CCValAssign, 16> ArgLocs; 02405 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); 02406 02407 // Allocate shadow area for Win64 02408 if (IsWin64) 02409 CCInfo.AllocateStack(32, 8); 02410 02411 CCInfo.AnalyzeFormalArguments(Ins, CC_X86); 02412 02413 unsigned LastVal = ~0U; 02414 SDValue ArgValue; 02415 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 02416 CCValAssign &VA = ArgLocs[i]; 02417 // TODO: If an arg is passed in two places (e.g. reg and stack), skip later 02418 // places. 02419 assert(VA.getValNo() != LastVal && 02420 "Don't support value assigned to multiple locs yet"); 02421 (void)LastVal; 02422 LastVal = VA.getValNo(); 02423 02424 if (VA.isRegLoc()) { 02425 EVT RegVT = VA.getLocVT(); 02426 const TargetRegisterClass *RC; 02427 if (RegVT == MVT::i32) 02428 RC = &X86::GR32RegClass; 02429 else if (Is64Bit && RegVT == MVT::i64) 02430 RC = &X86::GR64RegClass; 02431 else if (RegVT == MVT::f32) 02432 RC = &X86::FR32RegClass; 02433 else if (RegVT == MVT::f64) 02434 RC = &X86::FR64RegClass; 02435 else if (RegVT.is512BitVector()) 02436 RC = &X86::VR512RegClass; 02437 else if (RegVT.is256BitVector()) 02438 RC = &X86::VR256RegClass; 02439 else if (RegVT.is128BitVector()) 02440 RC = &X86::VR128RegClass; 02441 else if (RegVT == MVT::x86mmx) 02442 RC = &X86::VR64RegClass; 02443 else if (RegVT == MVT::i1) 02444 RC = &X86::VK1RegClass; 02445 else if (RegVT == MVT::v8i1) 02446 RC = &X86::VK8RegClass; 02447 else if (RegVT == MVT::v16i1) 02448 RC = &X86::VK16RegClass; 02449 else if (RegVT == MVT::v32i1) 02450 RC = &X86::VK32RegClass; 02451 else if (RegVT == MVT::v64i1) 02452 RC = &X86::VK64RegClass; 02453 else 02454 llvm_unreachable("Unknown argument type!"); 02455 02456 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 02457 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 02458 02459 // If this is an 8 or 16-bit value, it is really passed promoted to 32 02460 // bits. Insert an assert[sz]ext to capture this, then truncate to the 02461 // right size. 
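// Schematic example (value numbers made up) of the nodes built below for an
// i8 zeroext argument that arrives in a 32-bit register:
//   t1: i32 = CopyFromReg %EDI              // e.g. first integer arg, x86-64
//   t2: i32 = AssertZext t1, ValueType:i8   // records the caller's extension
//   t3: i8  = truncate t2                   // value handed to the function body
// The AssertZext allows a later re-extension of t3 to be folded away.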
02462 if (VA.getLocInfo() == CCValAssign::SExt) 02463 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 02464 DAG.getValueType(VA.getValVT())); 02465 else if (VA.getLocInfo() == CCValAssign::ZExt) 02466 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 02467 DAG.getValueType(VA.getValVT())); 02468 else if (VA.getLocInfo() == CCValAssign::BCvt) 02469 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue); 02470 02471 if (VA.isExtInLoc()) { 02472 // Handle MMX values passed in XMM regs. 02473 if (RegVT.isVector()) 02474 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue); 02475 else 02476 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 02477 } 02478 } else { 02479 assert(VA.isMemLoc()); 02480 ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i); 02481 } 02482 02483 // If value is passed via pointer - do a load. 02484 if (VA.getLocInfo() == CCValAssign::Indirect) 02485 ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, 02486 MachinePointerInfo(), false, false, false, 0); 02487 02488 InVals.push_back(ArgValue); 02489 } 02490 02491 if (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) { 02492 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 02493 // The x86-64 ABIs require that for returning structs by value we copy 02494 // the sret argument into %rax/%eax (depending on ABI) for the return. 02495 // Win32 requires us to put the sret argument to %eax as well. 02496 // Save the argument into a virtual register so that we can access it 02497 // from the return points. 02498 if (Ins[i].Flags.isSRet()) { 02499 unsigned Reg = FuncInfo->getSRetReturnReg(); 02500 if (!Reg) { 02501 MVT PtrTy = getPointerTy(); 02502 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); 02503 FuncInfo->setSRetReturnReg(Reg); 02504 } 02505 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]); 02506 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); 02507 break; 02508 } 02509 } 02510 } 02511 02512 unsigned StackSize = CCInfo.getNextStackOffset(); 02513 // Align stack specially for tail calls. 02514 if (FuncIsMadeTailCallSafe(CallConv, 02515 MF.getTarget().Options.GuaranteedTailCallOpt)) 02516 StackSize = GetAlignedArgumentStackSize(StackSize, DAG); 02517 02518 // If the function takes variable number of arguments, make a frame index for 02519 // the start of the first vararg value... for expansion of llvm.va_start. We 02520 // can skip this if there are no va_start calls. 02521 if (MFI->hasVAStart() && 02522 (Is64Bit || (CallConv != CallingConv::X86_FastCall && 02523 CallConv != CallingConv::X86_ThisCall))) { 02524 FuncInfo->setVarArgsFrameIndex( 02525 MFI->CreateFixedObject(1, StackSize, true)); 02526 } 02527 02528 // 64-bit calling conventions support varargs and register parameters, so we 02529 // have to do extra work to spill them in the prologue or forward them to 02530 // musttail calls. 02531 if (Is64Bit && isVarArg && 02532 (MFI->hasVAStart() || MFI->hasMustTailInVarArgFunc())) { 02533 // Find the first unallocated argument registers. 
02534 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget); 02535 ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget); 02536 unsigned NumIntRegs = 02537 CCInfo.getFirstUnallocated(ArgGPRs.data(), ArgGPRs.size()); 02538 unsigned NumXMMRegs = 02539 CCInfo.getFirstUnallocated(ArgXMMs.data(), ArgXMMs.size()); 02540 assert(!(NumXMMRegs && !Subtarget->hasSSE1()) && 02541 "SSE register cannot be used when SSE is disabled!"); 02542 02543 // Gather all the live in physical registers. 02544 SmallVector<SDValue, 6> LiveGPRs; 02545 SmallVector<SDValue, 8> LiveXMMRegs; 02546 SDValue ALVal; 02547 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) { 02548 unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass); 02549 LiveGPRs.push_back( 02550 DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64)); 02551 } 02552 if (!ArgXMMs.empty()) { 02553 unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass); 02554 ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8); 02555 for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) { 02556 unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass); 02557 LiveXMMRegs.push_back( 02558 DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32)); 02559 } 02560 } 02561 02562 // Store them to the va_list returned by va_start. 02563 if (MFI->hasVAStart()) { 02564 if (IsWin64) { 02565 const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering(); 02566 // Get to the caller-allocated home save location. Add 8 to account 02567 // for the return address. 02568 int HomeOffset = TFI.getOffsetOfLocalArea() + 8; 02569 FuncInfo->setRegSaveFrameIndex( 02570 MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); 02571 // Fixup to set vararg frame on shadow area (4 x i64). 02572 if (NumIntRegs < 4) 02573 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); 02574 } else { 02575 // For X86-64, if there are vararg parameters that are passed via 02576 // registers, then we must store them to their spots on the stack so 02577 // they may be loaded by deferencing the result of va_next. 02578 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); 02579 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16); 02580 FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject( 02581 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false)); 02582 } 02583 02584 // Store the integer parameter registers. 02585 SmallVector<SDValue, 8> MemOps; 02586 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 02587 getPointerTy()); 02588 unsigned Offset = FuncInfo->getVarArgsGPOffset(); 02589 for (SDValue Val : LiveGPRs) { 02590 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, 02591 DAG.getIntPtrConstant(Offset)); 02592 SDValue Store = 02593 DAG.getStore(Val.getValue(1), dl, Val, FIN, 02594 MachinePointerInfo::getFixedStack( 02595 FuncInfo->getRegSaveFrameIndex(), Offset), 02596 false, false, 0); 02597 MemOps.push_back(Store); 02598 Offset += 8; 02599 } 02600 02601 if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) { 02602 // Now store the XMM (fp + vector) parameter registers. 
02603 SmallVector<SDValue, 12> SaveXMMOps; 02604 SaveXMMOps.push_back(Chain); 02605 SaveXMMOps.push_back(ALVal); 02606 SaveXMMOps.push_back(DAG.getIntPtrConstant( 02607 FuncInfo->getRegSaveFrameIndex())); 02608 SaveXMMOps.push_back(DAG.getIntPtrConstant( 02609 FuncInfo->getVarArgsFPOffset())); 02610 SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(), 02611 LiveXMMRegs.end()); 02612 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, 02613 MVT::Other, SaveXMMOps)); 02614 } 02615 02616 if (!MemOps.empty()) 02617 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); 02618 } else { 02619 // Add all GPRs, al, and XMMs to the list of forwards. We will add then 02620 // to the liveout set on a musttail call. 02621 assert(MFI->hasMustTailInVarArgFunc()); 02622 auto &Forwards = FuncInfo->getForwardedMustTailRegParms(); 02623 typedef X86MachineFunctionInfo::Forward Forward; 02624 02625 for (unsigned I = 0, E = LiveGPRs.size(); I != E; ++I) { 02626 unsigned VReg = 02627 MF.getRegInfo().createVirtualRegister(&X86::GR64RegClass); 02628 Chain = DAG.getCopyToReg(Chain, dl, VReg, LiveGPRs[I]); 02629 Forwards.push_back(Forward(VReg, ArgGPRs[NumIntRegs + I], MVT::i64)); 02630 } 02631 02632 if (!ArgXMMs.empty()) { 02633 unsigned ALVReg = 02634 MF.getRegInfo().createVirtualRegister(&X86::GR8RegClass); 02635 Chain = DAG.getCopyToReg(Chain, dl, ALVReg, ALVal); 02636 Forwards.push_back(Forward(ALVReg, X86::AL, MVT::i8)); 02637 02638 for (unsigned I = 0, E = LiveXMMRegs.size(); I != E; ++I) { 02639 unsigned VReg = 02640 MF.getRegInfo().createVirtualRegister(&X86::VR128RegClass); 02641 Chain = DAG.getCopyToReg(Chain, dl, VReg, LiveXMMRegs[I]); 02642 Forwards.push_back( 02643 Forward(VReg, ArgXMMs[NumXMMRegs + I], MVT::v4f32)); 02644 } 02645 } 02646 } 02647 } 02648 02649 // Some CCs need callee pop. 02650 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, 02651 MF.getTarget().Options.GuaranteedTailCallOpt)) { 02652 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. 02653 } else { 02654 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. 02655 // If this is an sret function, the return should pop the hidden pointer. 02656 if (!Is64Bit && !IsTailCallConvention(CallConv) && 02657 !Subtarget->getTargetTriple().isOSMSVCRT() && 02658 argsAreStructReturn(Ins) == StackStructReturn) 02659 FuncInfo->setBytesToPopOnReturn(4); 02660 } 02661 02662 if (!Is64Bit) { 02663 // RegSaveFrameIndex is X86-64 only. 02664 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); 02665 if (CallConv == CallingConv::X86_FastCall || 02666 CallConv == CallingConv::X86_ThisCall) 02667 // fastcc functions can't have varargs. 
02668 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); 02669 } 02670 02671 FuncInfo->setArgumentStackSize(StackSize); 02672 02673 return Chain; 02674 } 02675 02676 SDValue 02677 X86TargetLowering::LowerMemOpCallTo(SDValue Chain, 02678 SDValue StackPtr, SDValue Arg, 02679 SDLoc dl, SelectionDAG &DAG, 02680 const CCValAssign &VA, 02681 ISD::ArgFlagsTy Flags) const { 02682 unsigned LocMemOffset = VA.getLocMemOffset(); 02683 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); 02684 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); 02685 if (Flags.isByVal()) 02686 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); 02687 02688 return DAG.getStore(Chain, dl, Arg, PtrOff, 02689 MachinePointerInfo::getStack(LocMemOffset), 02690 false, false, 0); 02691 } 02692 02693 /// EmitTailCallLoadRetAddr - Emit a load of return address if tail call 02694 /// optimization is performed and it is required. 02695 SDValue 02696 X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, 02697 SDValue &OutRetAddr, SDValue Chain, 02698 bool IsTailCall, bool Is64Bit, 02699 int FPDiff, SDLoc dl) const { 02700 // Adjust the Return address stack slot. 02701 EVT VT = getPointerTy(); 02702 OutRetAddr = getReturnAddressFrameIndex(DAG); 02703 02704 // Load the "old" Return address. 02705 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(), 02706 false, false, false, 0); 02707 return SDValue(OutRetAddr.getNode(), 1); 02708 } 02709 02710 /// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call 02711 /// optimization is performed and it is required (FPDiff!=0). 02712 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF, 02713 SDValue Chain, SDValue RetAddrFrIdx, 02714 EVT PtrVT, unsigned SlotSize, 02715 int FPDiff, SDLoc dl) { 02716 // Store the return address to the appropriate stack slot. 02717 if (!FPDiff) return Chain; 02718 // Calculate the new stack slot for the return address. 02719 int NewReturnAddrFI = 02720 MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize, 02721 false); 02722 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT); 02723 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, 02724 MachinePointerInfo::getFixedStack(NewReturnAddrFI), 02725 false, false, 0); 02726 return Chain; 02727 } 02728 02729 SDValue 02730 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, 02731 SmallVectorImpl<SDValue> &InVals) const { 02732 SelectionDAG &DAG = CLI.DAG; 02733 SDLoc &dl = CLI.DL; 02734 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; 02735 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; 02736 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; 02737 SDValue Chain = CLI.Chain; 02738 SDValue Callee = CLI.Callee; 02739 CallingConv::ID CallConv = CLI.CallConv; 02740 bool &isTailCall = CLI.IsTailCall; 02741 bool isVarArg = CLI.IsVarArg; 02742 02743 MachineFunction &MF = DAG.getMachineFunction(); 02744 bool Is64Bit = Subtarget->is64Bit(); 02745 bool IsWin64 = Subtarget->isCallingConvWin64(CallConv); 02746 StructReturnType SR = callIsStructReturn(Outs); 02747 bool IsSibcall = false; 02748 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>(); 02749 02750 if (MF.getTarget().Options.DisableTailCalls) 02751 isTailCall = false; 02752 02753 bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall(); 02754 if (IsMustTail) { 02755 // Force this to be a tail call. 
The verifier rules are enough to ensure 02756 // that we can lower this successfully without moving the return address 02757 // around. 02758 isTailCall = true; 02759 } else if (isTailCall) { 02760 // Check if it's really possible to do a tail call. 02761 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, 02762 isVarArg, SR != NotStructReturn, 02763 MF.getFunction()->hasStructRetAttr(), CLI.RetTy, 02764 Outs, OutVals, Ins, DAG); 02765 02766 // Sibcalls are automatically detected tailcalls which do not require 02767 // ABI changes. 02768 if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall) 02769 IsSibcall = true; 02770 02771 if (isTailCall) 02772 ++NumTailCalls; 02773 } 02774 02775 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 02776 "Var args not supported with calling convention fastcc, ghc or hipe"); 02777 02778 // Analyze operands of the call, assigning locations to each operand. 02779 SmallVector<CCValAssign, 16> ArgLocs; 02780 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); 02781 02782 // Allocate shadow area for Win64 02783 if (IsWin64) 02784 CCInfo.AllocateStack(32, 8); 02785 02786 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 02787 02788 // Get a count of how many bytes are to be pushed on the stack. 02789 unsigned NumBytes = CCInfo.getNextStackOffset(); 02790 if (IsSibcall) 02791 // This is a sibcall. The memory operands are available in caller's 02792 // own caller's stack. 02793 NumBytes = 0; 02794 else if (MF.getTarget().Options.GuaranteedTailCallOpt && 02795 IsTailCallConvention(CallConv)) 02796 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); 02797 02798 int FPDiff = 0; 02799 if (isTailCall && !IsSibcall && !IsMustTail) { 02800 // Lower arguments at fp - stackoffset + fpdiff. 02801 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn(); 02802 02803 FPDiff = NumBytesCallerPushed - NumBytes; 02804 02805 // Set the delta of movement of the returnaddr stackslot. 02806 // But only set if delta is greater than previous delta. 02807 if (FPDiff < X86Info->getTCReturnAddrDelta()) 02808 X86Info->setTCReturnAddrDelta(FPDiff); 02809 } 02810 02811 unsigned NumBytesToPush = NumBytes; 02812 unsigned NumBytesToPop = NumBytes; 02813 02814 // If we have an inalloca argument, all stack space has already been allocated 02815 // for us and be right at the top of the stack. We don't support multiple 02816 // arguments passed in memory when using inalloca. 02817 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) { 02818 NumBytesToPush = 0; 02819 if (!ArgLocs.back().isMemLoc()) 02820 report_fatal_error("cannot use inalloca attribute on a register " 02821 "parameter"); 02822 if (ArgLocs.back().getLocMemOffset() != 0) 02823 report_fatal_error("any parameter with the inalloca attribute must be " 02824 "the only memory argument"); 02825 } 02826 02827 if (!IsSibcall) 02828 Chain = DAG.getCALLSEQ_START( 02829 Chain, DAG.getIntPtrConstant(NumBytesToPush, true), dl); 02830 02831 SDValue RetAddrFrIdx; 02832 // Load return address for tail calls. 02833 if (isTailCall && FPDiff) 02834 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, 02835 Is64Bit, FPDiff, dl); 02836 02837 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; 02838 SmallVector<SDValue, 8> MemOpChains; 02839 SDValue StackPtr; 02840 02841 // Walk the register/memloc assignments, inserting copies/loads. In the case 02842 // of tail call optimization arguments are handle later. 
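// Editorial illustration of the FPDiff computation above (not part of the
// original source): under GuaranteedTailCallOpt, if the fastcc caller was
// entered with 16 bytes of stack arguments (getBytesToPopOnReturn() == 16)
// while the tail-called callee needs NumBytes == 48, then
// FPDiff == 16 - 48 == -32. The callee needs 32 bytes more than the caller's
// incoming argument area provides, so EmitTailCallStoreRetAddr later stores
// the return address into a fixed stack object at offset FPDiff - SlotSize,
// i.e. 32 bytes below the incoming return-address slot.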
02843 const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( 02844 DAG.getSubtarget().getRegisterInfo()); 02845 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 02846 // Skip inalloca arguments, they have already been written. 02847 ISD::ArgFlagsTy Flags = Outs[i].Flags; 02848 if (Flags.isInAlloca()) 02849 continue; 02850 02851 CCValAssign &VA = ArgLocs[i]; 02852 EVT RegVT = VA.getLocVT(); 02853 SDValue Arg = OutVals[i]; 02854 bool isByVal = Flags.isByVal(); 02855 02856 // Promote the value if needed. 02857 switch (VA.getLocInfo()) { 02858 default: llvm_unreachable("Unknown loc info!"); 02859 case CCValAssign::Full: break; 02860 case CCValAssign::SExt: 02861 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); 02862 break; 02863 case CCValAssign::ZExt: 02864 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg); 02865 break; 02866 case CCValAssign::AExt: 02867 if (RegVT.is128BitVector()) { 02868 // Special case: passing MMX values in XMM registers. 02869 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg); 02870 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); 02871 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg); 02872 } else 02873 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg); 02874 break; 02875 case CCValAssign::BCvt: 02876 Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg); 02877 break; 02878 case CCValAssign::Indirect: { 02879 // Store the argument. 02880 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); 02881 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); 02882 Chain = DAG.getStore(Chain, dl, Arg, SpillSlot, 02883 MachinePointerInfo::getFixedStack(FI), 02884 false, false, 0); 02885 Arg = SpillSlot; 02886 break; 02887 } 02888 } 02889 02890 if (VA.isRegLoc()) { 02891 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 02892 if (isVarArg && IsWin64) { 02893 // Win64 ABI requires argument XMM reg to be copied to the corresponding 02894 // shadow reg if callee is a varargs function. 02895 unsigned ShadowReg = 0; 02896 switch (VA.getLocReg()) { 02897 case X86::XMM0: ShadowReg = X86::RCX; break; 02898 case X86::XMM1: ShadowReg = X86::RDX; break; 02899 case X86::XMM2: ShadowReg = X86::R8; break; 02900 case X86::XMM3: ShadowReg = X86::R9; break; 02901 } 02902 if (ShadowReg) 02903 RegsToPass.push_back(std::make_pair(ShadowReg, Arg)); 02904 } 02905 } else if (!IsSibcall && (!isTailCall || isByVal)) { 02906 assert(VA.isMemLoc()); 02907 if (!StackPtr.getNode()) 02908 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), 02909 getPointerTy()); 02910 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, 02911 dl, DAG, VA, Flags)); 02912 } 02913 } 02914 02915 if (!MemOpChains.empty()) 02916 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); 02917 02918 if (Subtarget->isPICStyleGOT()) { 02919 // ELF / PIC requires GOT in the EBX register before function calls via PLT 02920 // GOT pointer. 02921 if (!isTailCall) { 02922 RegsToPass.push_back(std::make_pair(unsigned(X86::EBX), 02923 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy()))); 02924 } else { 02925 // If we are tail calling and generating PIC/GOT style code load the 02926 // address of the callee into ECX. The value in ecx is used as target of 02927 // the tail jump. This is done to circumvent the ebx/callee-saved problem 02928 // for tail calls on PIC/GOT architectures. Normally we would just put the 02929 // address of GOT into ebx and then call target@PLT. 
But for tail calls 02930 // ebx would be restored (since ebx is callee saved) before jumping to the 02931 // target@PLT. 02932 02933 // Note: The actual moving to ECX is done further down. 02934 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); 02935 if (G && !G->getGlobal()->hasHiddenVisibility() && 02936 !G->getGlobal()->hasProtectedVisibility()) 02937 Callee = LowerGlobalAddress(Callee, DAG); 02938 else if (isa<ExternalSymbolSDNode>(Callee)) 02939 Callee = LowerExternalSymbol(Callee, DAG); 02940 } 02941 } 02942 02943 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) { 02944 // From AMD64 ABI document: 02945 // For calls that may call functions that use varargs or stdargs 02946 // (prototype-less calls or calls to functions containing ellipsis (...) in 02947 // the declaration) %al is used as hidden argument to specify the number 02948 // of SSE registers used. The contents of %al do not need to match exactly 02949 // the number of registers, but must be an ubound on the number of SSE 02950 // registers used and is in the range 0 - 8 inclusive. 02951 02952 // Count the number of XMM registers allocated. 02953 static const MCPhysReg XMMArgRegs[] = { 02954 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 02955 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 02956 }; 02957 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8); 02958 assert((Subtarget->hasSSE1() || !NumXMMRegs) 02959 && "SSE registers cannot be used when SSE is disabled"); 02960 02961 RegsToPass.push_back(std::make_pair(unsigned(X86::AL), 02962 DAG.getConstant(NumXMMRegs, MVT::i8))); 02963 } 02964 02965 if (Is64Bit && isVarArg && IsMustTail) { 02966 const auto &Forwards = X86Info->getForwardedMustTailRegParms(); 02967 for (const auto &F : Forwards) { 02968 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); 02969 RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val)); 02970 } 02971 } 02972 02973 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls 02974 // don't need this because the eligibility check rejects calls that require 02975 // shuffling arguments passed in memory. 02976 if (!IsSibcall && isTailCall) { 02977 // Force all the incoming stack arguments to be loaded from the stack 02978 // before any new outgoing arguments are stored to the stack, because the 02979 // outgoing stack slots may alias the incoming argument stack slots, and 02980 // the alias isn't otherwise explicit. This is slightly more conservative 02981 // than necessary, because it means that each store effectively depends 02982 // on every argument instead of just those arguments it would clobber. 02983 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain); 02984 02985 SmallVector<SDValue, 8> MemOpChains2; 02986 SDValue FIN; 02987 int FI = 0; 02988 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 02989 CCValAssign &VA = ArgLocs[i]; 02990 if (VA.isRegLoc()) 02991 continue; 02992 assert(VA.isMemLoc()); 02993 SDValue Arg = OutVals[i]; 02994 ISD::ArgFlagsTy Flags = Outs[i].Flags; 02995 // Skip inalloca arguments. They don't require any work. 02996 if (Flags.isInAlloca()) 02997 continue; 02998 // Create frame index. 02999 int32_t Offset = VA.getLocMemOffset()+FPDiff; 03000 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; 03001 FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); 03002 FIN = DAG.getFrameIndex(FI, getPointerTy()); 03003 03004 if (Flags.isByVal()) { 03005 // Copy relative to framepointer. 
03006 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset()); 03007 if (!StackPtr.getNode()) 03008 StackPtr = DAG.getCopyFromReg(Chain, dl, 03009 RegInfo->getStackRegister(), 03010 getPointerTy()); 03011 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source); 03012 03013 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, 03014 ArgChain, 03015 Flags, DAG, dl)); 03016 } else { 03017 // Store relative to framepointer. 03018 MemOpChains2.push_back( 03019 DAG.getStore(ArgChain, dl, Arg, FIN, 03020 MachinePointerInfo::getFixedStack(FI), 03021 false, false, 0)); 03022 } 03023 } 03024 03025 if (!MemOpChains2.empty()) 03026 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2); 03027 03028 // Store the return address to the appropriate stack slot. 03029 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, 03030 getPointerTy(), RegInfo->getSlotSize(), 03031 FPDiff, dl); 03032 } 03033 03034 // Build a sequence of copy-to-reg nodes chained together with token chain 03035 // and flag operands which copy the outgoing args into registers. 03036 SDValue InFlag; 03037 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 03038 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 03039 RegsToPass[i].second, InFlag); 03040 InFlag = Chain.getValue(1); 03041 } 03042 03043 if (DAG.getTarget().getCodeModel() == CodeModel::Large) { 03044 assert(Is64Bit && "Large code model is only legal in 64-bit mode."); 03045 // In the 64-bit large code model, we have to make all calls 03046 // through a register, since the call instruction's 32-bit 03047 // pc-relative offset may not be large enough to hold the whole 03048 // address. 03049 } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 03050 // If the callee is a GlobalAddress node (quite common, every direct call 03051 // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack 03052 // it. 03053 03054 // We should use extra load for direct calls to dllimported functions in 03055 // non-JIT mode. 03056 const GlobalValue *GV = G->getGlobal(); 03057 if (!GV->hasDLLImportStorageClass()) { 03058 unsigned char OpFlags = 0; 03059 bool ExtraLoad = false; 03060 unsigned WrapperKind = ISD::DELETED_NODE; 03061 03062 // On ELF targets, in both X86-64 and X86-32 mode, direct calls to 03063 // external symbols most go through the PLT in PIC mode. If the symbol 03064 // has hidden or protected visibility, or if it is static or local, then 03065 // we don't need to use the PLT - we can directly call it. 03066 if (Subtarget->isTargetELF() && 03067 DAG.getTarget().getRelocationModel() == Reloc::PIC_ && 03068 GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) { 03069 OpFlags = X86II::MO_PLT; 03070 } else if (Subtarget->isPICStyleStubAny() && 03071 (GV->isDeclaration() || GV->isWeakForLinker()) && 03072 (!Subtarget->getTargetTriple().isMacOSX() || 03073 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { 03074 // PC-relative references to external symbols should go through $stub, 03075 // unless we're building with the leopard linker or later, which 03076 // automatically synthesizes these stubs. 03077 OpFlags = X86II::MO_DARWIN_STUB; 03078 } else if (Subtarget->isPICStyleRIPRel() && 03079 isa<Function>(GV) && 03080 cast<Function>(GV)->getAttributes(). 03081 hasAttribute(AttributeSet::FunctionIndex, 03082 Attribute::NonLazyBind)) { 03083 // If the function is marked as non-lazy, generate an indirect call 03084 // which loads from the GOT directly. 
This avoids runtime overhead 03085 // at the cost of eager binding (and one extra byte of encoding). 03086 OpFlags = X86II::MO_GOTPCREL; 03087 WrapperKind = X86ISD::WrapperRIP; 03088 ExtraLoad = true; 03089 } 03090 03091 Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 03092 G->getOffset(), OpFlags); 03093 03094 // Add a wrapper if needed. 03095 if (WrapperKind != ISD::DELETED_NODE) 03096 Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee); 03097 // Add extra indirection if needed. 03098 if (ExtraLoad) 03099 Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee, 03100 MachinePointerInfo::getGOT(), 03101 false, false, false, 0); 03102 } 03103 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 03104 unsigned char OpFlags = 0; 03105 03106 // On ELF targets, in either X86-64 or X86-32 mode, direct calls to 03107 // external symbols should go through the PLT. 03108 if (Subtarget->isTargetELF() && 03109 DAG.getTarget().getRelocationModel() == Reloc::PIC_) { 03110 OpFlags = X86II::MO_PLT; 03111 } else if (Subtarget->isPICStyleStubAny() && 03112 (!Subtarget->getTargetTriple().isMacOSX() || 03113 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { 03114 // PC-relative references to external symbols should go through $stub, 03115 // unless we're building with the leopard linker or later, which 03116 // automatically synthesizes these stubs. 03117 OpFlags = X86II::MO_DARWIN_STUB; 03118 } 03119 03120 Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(), 03121 OpFlags); 03122 } else if (Subtarget->isTarget64BitILP32() && Callee->getValueType(0) == MVT::i32) { 03123 // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI 03124 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee); 03125 } 03126 03127 // Returns a chain & a flag for retval copy to use. 03128 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 03129 SmallVector<SDValue, 8> Ops; 03130 03131 if (!IsSibcall && isTailCall) { 03132 Chain = DAG.getCALLSEQ_END(Chain, 03133 DAG.getIntPtrConstant(NumBytesToPop, true), 03134 DAG.getIntPtrConstant(0, true), InFlag, dl); 03135 InFlag = Chain.getValue(1); 03136 } 03137 03138 Ops.push_back(Chain); 03139 Ops.push_back(Callee); 03140 03141 if (isTailCall) 03142 Ops.push_back(DAG.getConstant(FPDiff, MVT::i32)); 03143 03144 // Add argument registers to the end of the list so that they are known live 03145 // into the call. 03146 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 03147 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 03148 RegsToPass[i].second.getValueType())); 03149 03150 // Add a register mask operand representing the call-preserved registers. 03151 const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo(); 03152 const uint32_t *Mask = TRI->getCallPreservedMask(CallConv); 03153 assert(Mask && "Missing call preserved mask for calling convention"); 03154 Ops.push_back(DAG.getRegisterMask(Mask)); 03155 03156 if (InFlag.getNode()) 03157 Ops.push_back(InFlag); 03158 03159 if (isTailCall) { 03160 // We used to do: 03161 //// If this is the first return lowered for this function, add the regs 03162 //// to the liveout set for the function. 03163 // This isn't right, although it's probably harmless on x86; liveouts 03164 // should be computed from returns not tail calls. Consider a void 03165 // function making a tail call to a function returning int. 
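// Editorial illustration of the comment above (not part of the original
// source): if a function returning void makes a tail call to a function
// returning i32, the i32 travels in EAX to the original caller. Adding EAX to
// the void function's live-out set because of the tail call would not be
// correct (even if probably harmless on x86), which is why live-outs are
// derived from returns rather than from tail calls.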
03166     return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
03167   }
03168 
03169   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
03170   InFlag = Chain.getValue(1);
03171 
03172   // Create the CALLSEQ_END node.
03173   unsigned NumBytesForCalleeToPop;
03174   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
03175                        DAG.getTarget().Options.GuaranteedTailCallOpt))
03176     NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
03177   else if (!Is64Bit && !IsTailCallConvention(CallConv) &&
03178            !Subtarget->getTargetTriple().isOSMSVCRT() &&
03179            SR == StackStructReturn)
03180     // If this is a call to a struct-return function, the callee
03181     // pops the hidden struct pointer, so we have to push it back.
03182     // This is common for Darwin/X86, Linux & Mingw32 targets.
03183     // For MSVC Win32 targets, the caller pops the hidden struct pointer.
03184     NumBytesForCalleeToPop = 4;
03185   else
03186     NumBytesForCalleeToPop = 0;  // Callee pops nothing.
03187 
03188   // Returns a flag for retval copy to use.
03189   if (!IsSibcall) {
03190     Chain = DAG.getCALLSEQ_END(Chain,
03191                                DAG.getIntPtrConstant(NumBytesToPop, true),
03192                                DAG.getIntPtrConstant(NumBytesForCalleeToPop,
03193                                                      true),
03194                                InFlag, dl);
03195     InFlag = Chain.getValue(1);
03196   }
03197 
03198   // Handle result values, copying them out of physregs into vregs that we
03199   // return.
03200   return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
03201                          Ins, dl, DAG, InVals);
03202 }
03203 
03204 //===----------------------------------------------------------------------===//
03205 //                Fast Calling Convention (tail call) implementation
03206 //===----------------------------------------------------------------------===//
03207 
03208 // Like the stdcall convention, the callee cleans up the arguments, except
03209 // that ECX is reserved for storing the tail-called function's address. Only 2
03210 // registers are free for argument passing (inreg). Tail call optimization is
03211 // performed provided:
03212 //  * tailcallopt is enabled
03213 //  * caller/callee are fastcc
03214 // On the X86_64 architecture with GOT-style position independent code, only
03215 // local (within module) calls are supported at the moment.
03216 // To keep the stack aligned according to the platform ABI, the function
03217 // GetAlignedArgumentStackSize ensures that the argument delta is always a
03218 // multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld
03219 // for example.) If a tail-called callee has more arguments than the caller,
03220 // the caller needs to make sure that there is room to move the RETADDR to.
03221 // This is achieved by reserving an area the size of the argument delta right
03222 // after the original RETADDR, but before the saved framepointer or the
03223 // spilled registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4).
03224 // stack layout:
03225 //   arg1
03226 //   arg2
03227 //   RETADDR
03228 //   [ new RETADDR
03229 //     move area ]
03230 //   (possible EBP)
03231 //   ESI
03232 //   EDI
03233 //   local1 ..
03234 
03235 /// GetAlignedArgumentStackSize - Align the stack size, e.g. to 16n + 12 for
03236 /// a 16-byte alignment requirement.
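// Editorial worked example for the computation below (not part of the
// original source): with StackAlignment == 16 and a 4-byte return-address
// slot (SlotSize == 4), the target form is 16n + 12:
//   StackSize == 20: (20 & 15) == 4  <= 12, so Offset = 20 + (12 - 4)        == 28
//   StackSize == 30: (30 & 15) == 14 >  12, so Offset = (30 & ~15) + 16 + 12 == 44
// Both results are congruent to 12 modulo 16, so pushing the return address
// restores full 16-byte alignment. With 8-byte slots (64-bit) the target form
// becomes 16n + 8 instead.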
03237 unsigned 03238 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, 03239 SelectionDAG& DAG) const { 03240 MachineFunction &MF = DAG.getMachineFunction(); 03241 const TargetMachine &TM = MF.getTarget(); 03242 const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( 03243 TM.getSubtargetImpl()->getRegisterInfo()); 03244 const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering(); 03245 unsigned StackAlignment = TFI.getStackAlignment(); 03246 uint64_t AlignMask = StackAlignment - 1; 03247 int64_t Offset = StackSize; 03248 unsigned SlotSize = RegInfo->getSlotSize(); 03249 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) { 03250 // Number smaller than 12 so just add the difference. 03251 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask)); 03252 } else { 03253 // Mask out lower bits, add stackalignment once plus the 12 bytes. 03254 Offset = ((~AlignMask) & Offset) + StackAlignment + 03255 (StackAlignment-SlotSize); 03256 } 03257 return Offset; 03258 } 03259 03260 /// MatchingStackOffset - Return true if the given stack call argument is 03261 /// already available in the same position (relatively) of the caller's 03262 /// incoming argument stack. 03263 static 03264 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, 03265 MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, 03266 const X86InstrInfo *TII) { 03267 unsigned Bytes = Arg.getValueType().getSizeInBits() / 8; 03268 int FI = INT_MAX; 03269 if (Arg.getOpcode() == ISD::CopyFromReg) { 03270 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); 03271 if (!TargetRegisterInfo::isVirtualRegister(VR)) 03272 return false; 03273 MachineInstr *Def = MRI->getVRegDef(VR); 03274 if (!Def) 03275 return false; 03276 if (!Flags.isByVal()) { 03277 if (!TII->isLoadFromStackSlot(Def, FI)) 03278 return false; 03279 } else { 03280 unsigned Opcode = Def->getOpcode(); 03281 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) && 03282 Def->getOperand(1).isFI()) { 03283 FI = Def->getOperand(1).getIndex(); 03284 Bytes = Flags.getByValSize(); 03285 } else 03286 return false; 03287 } 03288 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { 03289 if (Flags.isByVal()) 03290 // ByVal argument is passed in as a pointer but it's now being 03291 // dereferenced. e.g. 03292 // define @foo(%struct.X* %A) { 03293 // tail call @bar(%struct.X* byval %A) 03294 // } 03295 return false; 03296 SDValue Ptr = Ld->getBasePtr(); 03297 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); 03298 if (!FINode) 03299 return false; 03300 FI = FINode->getIndex(); 03301 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) { 03302 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg); 03303 FI = FINode->getIndex(); 03304 Bytes = Flags.getByValSize(); 03305 } else 03306 return false; 03307 03308 assert(FI != INT_MAX); 03309 if (!MFI->isFixedObjectIndex(FI)) 03310 return false; 03311 return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); 03312 } 03313 03314 /// IsEligibleForTailCallOptimization - Check whether the call is eligible 03315 /// for tail call optimization. Targets which want to do tail call 03316 /// optimization should implement this function. 
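// Editorial illustration (assumed example, not part of the original source):
// with the default C calling convention and matching signatures, e.g.
//   define i32 @caller(i32 %x) {
//     %r = tail call i32 @callee(i32 %x)
//     ret i32 %r
//   }
// the checks below can succeed and the call is lowered as a sibcall (a plain
// jump via X86ISD::TC_RETURN), whereas a call whose stack arguments are not
// already laid out in the caller's own incoming argument slots (see
// MatchingStackOffset above) is rejected.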
03317 bool 03318 X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, 03319 CallingConv::ID CalleeCC, 03320 bool isVarArg, 03321 bool isCalleeStructRet, 03322 bool isCallerStructRet, 03323 Type *RetTy, 03324 const SmallVectorImpl<ISD::OutputArg> &Outs, 03325 const SmallVectorImpl<SDValue> &OutVals, 03326 const SmallVectorImpl<ISD::InputArg> &Ins, 03327 SelectionDAG &DAG) const { 03328 if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC)) 03329 return false; 03330 03331 // If -tailcallopt is specified, make fastcc functions tail-callable. 03332 const MachineFunction &MF = DAG.getMachineFunction(); 03333 const Function *CallerF = MF.getFunction(); 03334 03335 // If the function return type is x86_fp80 and the callee return type is not, 03336 // then the FP_EXTEND of the call result is not a nop. It's not safe to 03337 // perform a tailcall optimization here. 03338 if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty()) 03339 return false; 03340 03341 CallingConv::ID CallerCC = CallerF->getCallingConv(); 03342 bool CCMatch = CallerCC == CalleeCC; 03343 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC); 03344 bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC); 03345 03346 if (DAG.getTarget().Options.GuaranteedTailCallOpt) { 03347 if (IsTailCallConvention(CalleeCC) && CCMatch) 03348 return true; 03349 return false; 03350 } 03351 03352 // Look for obvious safe cases to perform tail call optimization that do not 03353 // require ABI changes. This is what gcc calls sibcall. 03354 03355 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to 03356 // emit a special epilogue. 03357 const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( 03358 DAG.getSubtarget().getRegisterInfo()); 03359 if (RegInfo->needsStackRealignment(MF)) 03360 return false; 03361 03362 // Also avoid sibcall optimization if either caller or callee uses struct 03363 // return semantics. 03364 if (isCalleeStructRet || isCallerStructRet) 03365 return false; 03366 03367 // An stdcall/thiscall caller is expected to clean up its arguments; the 03368 // callee isn't going to do that. 03369 // FIXME: this is more restrictive than needed. We could produce a tailcall 03370 // when the stack adjustment matches. For example, with a thiscall that takes 03371 // only one argument. 03372 if (!CCMatch && (CallerCC == CallingConv::X86_StdCall || 03373 CallerCC == CallingConv::X86_ThisCall)) 03374 return false; 03375 03376 // Do not sibcall optimize vararg calls unless all arguments are passed via 03377 // registers. 03378 if (isVarArg && !Outs.empty()) { 03379 03380 // Optimizing for varargs on Win64 is unlikely to be safe without 03381 // additional testing. 03382 if (IsCalleeWin64 || IsCallerWin64) 03383 return false; 03384 03385 SmallVector<CCValAssign, 16> ArgLocs; 03386 CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, 03387 *DAG.getContext()); 03388 03389 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 03390 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) 03391 if (!ArgLocs[i].isRegLoc()) 03392 return false; 03393 } 03394 03395 // If the call result is in ST0 / ST1, it needs to be popped off the x87 03396 // stack. Therefore, if it's not used by the call it is not safe to optimize 03397 // this into a sibcall. 
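// Editorial illustration of the comment above (not part of the original
// source): if the callee returns an x86_fp80 value in ST0 that this call
// discards, a sibcall jumps away without emitting any code that would pop
// ST0, and the callee eventually returns to our caller with an unbalanced x87
// register stack; the loop below therefore rejects such calls.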
03398 bool Unused = false; 03399 for (unsigned i = 0, e = Ins.size(); i != e; ++i) { 03400 if (!Ins[i].Used) { 03401 Unused = true; 03402 break; 03403 } 03404 } 03405 if (Unused) { 03406 SmallVector<CCValAssign, 16> RVLocs; 03407 CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), RVLocs, 03408 *DAG.getContext()); 03409 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 03410 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { 03411 CCValAssign &VA = RVLocs[i]; 03412 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) 03413 return false; 03414 } 03415 } 03416 03417 // If the calling conventions do not match, then we'd better make sure the 03418 // results are returned in the same way as what the caller expects. 03419 if (!CCMatch) { 03420 SmallVector<CCValAssign, 16> RVLocs1; 03421 CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1, 03422 *DAG.getContext()); 03423 CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); 03424 03425 SmallVector<CCValAssign, 16> RVLocs2; 03426 CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2, 03427 *DAG.getContext()); 03428 CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); 03429 03430 if (RVLocs1.size() != RVLocs2.size()) 03431 return false; 03432 for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { 03433 if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) 03434 return false; 03435 if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) 03436 return false; 03437 if (RVLocs1[i].isRegLoc()) { 03438 if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) 03439 return false; 03440 } else { 03441 if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) 03442 return false; 03443 } 03444 } 03445 } 03446 03447 // If the callee takes no arguments then go on to check the results of the 03448 // call. 03449 if (!Outs.empty()) { 03450 // Check if stack adjustment is needed. For now, do not do this if any 03451 // argument is passed on the stack. 03452 SmallVector<CCValAssign, 16> ArgLocs; 03453 CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, 03454 *DAG.getContext()); 03455 03456 // Allocate shadow area for Win64 03457 if (IsCalleeWin64) 03458 CCInfo.AllocateStack(32, 8); 03459 03460 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 03461 if (CCInfo.getNextStackOffset()) { 03462 MachineFunction &MF = DAG.getMachineFunction(); 03463 if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) 03464 return false; 03465 03466 // Check if the arguments are already laid out in the right way as 03467 // the caller's fixed stack objects. 03468 MachineFrameInfo *MFI = MF.getFrameInfo(); 03469 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 03470 const X86InstrInfo *TII = 03471 static_cast<const X86InstrInfo *>(DAG.getSubtarget().getInstrInfo()); 03472 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 03473 CCValAssign &VA = ArgLocs[i]; 03474 SDValue Arg = OutVals[i]; 03475 ISD::ArgFlagsTy Flags = Outs[i].Flags; 03476 if (VA.getLocInfo() == CCValAssign::Indirect) 03477 return false; 03478 if (!VA.isRegLoc()) { 03479 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, 03480 MFI, MRI, TII)) 03481 return false; 03482 } 03483 } 03484 } 03485 03486 // If the tailcall address may be in a register, then make sure it's 03487 // possible to register allocate for it. In 32-bit, the call address can 03488 // only target EAX, EDX, or ECX since the tail call must be scheduled after 03489 // callee-saved registers are restored. 
These happen to be the same 03490 // registers used to pass 'inreg' arguments so watch out for those. 03491 if (!Subtarget->is64Bit() && 03492 ((!isa<GlobalAddressSDNode>(Callee) && 03493 !isa<ExternalSymbolSDNode>(Callee)) || 03494 DAG.getTarget().getRelocationModel() == Reloc::PIC_)) { 03495 unsigned NumInRegs = 0; 03496 // In PIC we need an extra register to formulate the address computation 03497 // for the callee. 03498 unsigned MaxInRegs = 03499 (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3; 03500 03501 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 03502 CCValAssign &VA = ArgLocs[i]; 03503 if (!VA.isRegLoc()) 03504 continue; 03505 unsigned Reg = VA.getLocReg(); 03506 switch (Reg) { 03507 default: break; 03508 case X86::EAX: case X86::EDX: case X86::ECX: 03509 if (++NumInRegs == MaxInRegs) 03510 return false; 03511 break; 03512 } 03513 } 03514 } 03515 } 03516 03517 return true; 03518 } 03519 03520 FastISel * 03521 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, 03522 const TargetLibraryInfo *libInfo) const { 03523 return X86::createFastISel(funcInfo, libInfo); 03524 } 03525 03526 //===----------------------------------------------------------------------===// 03527 // Other Lowering Hooks 03528 //===----------------------------------------------------------------------===// 03529 03530 static bool MayFoldLoad(SDValue Op) { 03531 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode()); 03532 } 03533 03534 static bool MayFoldIntoStore(SDValue Op) { 03535 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); 03536 } 03537 03538 static bool isTargetShuffle(unsigned Opcode) { 03539 switch(Opcode) { 03540 default: return false; 03541 case X86ISD::PSHUFB: 03542 case X86ISD::PSHUFD: 03543 case X86ISD::PSHUFHW: 03544 case X86ISD::PSHUFLW: 03545 case X86ISD::SHUFP: 03546 case X86ISD::PALIGNR: 03547 case X86ISD::MOVLHPS: 03548 case X86ISD::MOVLHPD: 03549 case X86ISD::MOVHLPS: 03550 case X86ISD::MOVLPS: 03551 case X86ISD::MOVLPD: 03552 case X86ISD::MOVSHDUP: 03553 case X86ISD::MOVSLDUP: 03554 case X86ISD::MOVDDUP: 03555 case X86ISD::MOVSS: 03556 case X86ISD::MOVSD: 03557 case X86ISD::UNPCKL: 03558 case X86ISD::UNPCKH: 03559 case X86ISD::VPERMILP: 03560 case X86ISD::VPERM2X128: 03561 case X86ISD::VPERMI: 03562 return true; 03563 } 03564 } 03565 03566 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, 03567 SDValue V1, SelectionDAG &DAG) { 03568 switch(Opc) { 03569 default: llvm_unreachable("Unknown x86 shuffle node"); 03570 case X86ISD::MOVSHDUP: 03571 case X86ISD::MOVSLDUP: 03572 case X86ISD::MOVDDUP: 03573 return DAG.getNode(Opc, dl, VT, V1); 03574 } 03575 } 03576 03577 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, 03578 SDValue V1, unsigned TargetMask, 03579 SelectionDAG &DAG) { 03580 switch(Opc) { 03581 default: llvm_unreachable("Unknown x86 shuffle node"); 03582 case X86ISD::PSHUFD: 03583 case X86ISD::PSHUFHW: 03584 case X86ISD::PSHUFLW: 03585 case X86ISD::VPERMILP: 03586 case X86ISD::VPERMI: 03587 return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8)); 03588 } 03589 } 03590 03591 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, 03592 SDValue V1, SDValue V2, unsigned TargetMask, 03593 SelectionDAG &DAG) { 03594 switch(Opc) { 03595 default: llvm_unreachable("Unknown x86 shuffle node"); 03596 case X86ISD::PALIGNR: 03597 case X86ISD::VALIGN: 03598 case X86ISD::SHUFP: 03599 case X86ISD::VPERM2X128: 03600 return DAG.getNode(Opc, dl, VT, V1, V2, 03601 
                       DAG.getConstant(TargetMask, MVT::i8));
03602   }
03603 }
03604 
03605 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03606                                     SDValue V1, SDValue V2, SelectionDAG &DAG) {
03607   switch(Opc) {
03608   default: llvm_unreachable("Unknown x86 shuffle node");
03609   case X86ISD::MOVLHPS:
03610   case X86ISD::MOVLHPD:
03611   case X86ISD::MOVHLPS:
03612   case X86ISD::MOVLPS:
03613   case X86ISD::MOVLPD:
03614   case X86ISD::MOVSS:
03615   case X86ISD::MOVSD:
03616   case X86ISD::UNPCKL:
03617   case X86ISD::UNPCKH:
03618     return DAG.getNode(Opc, dl, VT, V1, V2);
03619   }
03620 }
03621 
03622 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
03623   MachineFunction &MF = DAG.getMachineFunction();
03624   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
03625       DAG.getSubtarget().getRegisterInfo());
03626   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
03627   int ReturnAddrIndex = FuncInfo->getRAIndex();
03628 
03629   if (ReturnAddrIndex == 0) {
03630     // Set up a frame object for the return address.
03631     unsigned SlotSize = RegInfo->getSlotSize();
03632     ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
03633                                                            -(int64_t)SlotSize,
03634                                                            false);
03635     FuncInfo->setRAIndex(ReturnAddrIndex);
03636   }
03637 
03638   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
03639 }
03640 
03641 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
03642                                        bool hasSymbolicDisplacement) {
03643   // The offset should fit into a 32-bit immediate field.
03644   if (!isInt<32>(Offset))
03645     return false;
03646 
03647   // If we don't have a symbolic displacement - we don't have any extra
03648   // restrictions.
03649   if (!hasSymbolicDisplacement)
03650     return true;
03651 
03652   // FIXME: Some tweaks might be needed for medium code model.
03653   if (M != CodeModel::Small && M != CodeModel::Kernel)
03654     return false;
03655 
03656   // For the small code model we assume that the last object ends 16MB before
03657   // the end of the 31-bit boundary. We may also accept pretty large negative
03658   // constants, knowing that all objects are in the positive half of the address space.
03659   if (M == CodeModel::Small && Offset < 16*1024*1024)
03660     return true;
03661 
03662   // For the kernel code model we know that all objects reside in the negative
03663   // half of the 32-bit address space, so we must not accept negative offsets
03664   // (they may be just out of range), but we may accept pretty large positive ones.
03665   if (M == CodeModel::Kernel && Offset > 0)
03666     return true;
03667 
03668   return false;
03669 }
03670 
03671 /// isCalleePop - Determines whether the callee is required to pop its
03672 /// own arguments. Callee pop is necessary to support tail calls.
03673 bool X86::isCalleePop(CallingConv::ID CallingConv,
03674                       bool is64Bit, bool IsVarArg, bool TailCallOpt) {
03675   switch (CallingConv) {
03676   default:
03677     return false;
03678   case CallingConv::X86_StdCall:
03679   case CallingConv::X86_FastCall:
03680   case CallingConv::X86_ThisCall:
03681     return !is64Bit;
03682   case CallingConv::Fast:
03683   case CallingConv::GHC:
03684   case CallingConv::HiPE:
03685     if (IsVarArg)
03686       return false;
03687     return TailCallOpt;
03688   }
03689 }
03690 
03691 /// \brief Return true if the condition is an unsigned comparison operation.
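// Editorial note (not part of the original source): the conditions reported
// as unsigned here are the CF-based ones produced by unsigned compares
// (B, BE, A, AE); e.g. TranslateX86CC below maps ISD::SETULT to X86::COND_B,
// while signed compares map to the SF/OF-based G, GE, L and LE (ISD::SETLT to
// X86::COND_L). Equality and inequality (E, NE) are sign-agnostic and this
// predicate treats them as unsigned.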
03692 static bool isX86CCUnsigned(unsigned X86CC) { 03693 switch (X86CC) { 03694 default: llvm_unreachable("Invalid integer condition!"); 03695 case X86::COND_E: return true; 03696 case X86::COND_G: return false; 03697 case X86::COND_GE: return false; 03698 case X86::COND_L: return false; 03699 case X86::COND_LE: return false; 03700 case X86::COND_NE: return true; 03701 case X86::COND_B: return true; 03702 case X86::COND_A: return true; 03703 case X86::COND_BE: return true; 03704 case X86::COND_AE: return true; 03705 } 03706 llvm_unreachable("covered switch fell through?!"); 03707 } 03708 03709 /// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86 03710 /// specific condition code, returning the condition code and the LHS/RHS of the 03711 /// comparison to make. 03712 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP, 03713 SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { 03714 if (!isFP) { 03715 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) { 03716 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) { 03717 // X > -1 -> X == 0, jump !sign. 03718 RHS = DAG.getConstant(0, RHS.getValueType()); 03719 return X86::COND_NS; 03720 } 03721 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { 03722 // X < 0 -> X == 0, jump on sign. 03723 return X86::COND_S; 03724 } 03725 if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { 03726 // X < 1 -> X <= 0 03727 RHS = DAG.getConstant(0, RHS.getValueType()); 03728 return X86::COND_LE; 03729 } 03730 } 03731 03732 switch (SetCCOpcode) { 03733 default: llvm_unreachable("Invalid integer condition!"); 03734 case ISD::SETEQ: return X86::COND_E; 03735 case ISD::SETGT: return X86::COND_G; 03736 case ISD::SETGE: return X86::COND_GE; 03737 case ISD::SETLT: return X86::COND_L; 03738 case ISD::SETLE: return X86::COND_LE; 03739 case ISD::SETNE: return X86::COND_NE; 03740 case ISD::SETULT: return X86::COND_B; 03741 case ISD::SETUGT: return X86::COND_A; 03742 case ISD::SETULE: return X86::COND_BE; 03743 case ISD::SETUGE: return X86::COND_AE; 03744 } 03745 } 03746 03747 // First determine if it is required or is profitable to flip the operands. 03748 03749 // If LHS is a foldable load, but RHS is not, flip the condition. 
03750 if (ISD::isNON_EXTLoad(LHS.getNode()) && 03751 !ISD::isNON_EXTLoad(RHS.getNode())) { 03752 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode); 03753 std::swap(LHS, RHS); 03754 } 03755 03756 switch (SetCCOpcode) { 03757 default: break; 03758 case ISD::SETOLT: 03759 case ISD::SETOLE: 03760 case ISD::SETUGT: 03761 case ISD::SETUGE: 03762 std::swap(LHS, RHS); 03763 break; 03764 } 03765 03766 // On a floating point condition, the flags are set as follows: 03767 // ZF PF CF op 03768 // 0 | 0 | 0 | X > Y 03769 // 0 | 0 | 1 | X < Y 03770 // 1 | 0 | 0 | X == Y 03771 // 1 | 1 | 1 | unordered 03772 switch (SetCCOpcode) { 03773 default: llvm_unreachable("Condcode should be pre-legalized away"); 03774 case ISD::SETUEQ: 03775 case ISD::SETEQ: return X86::COND_E; 03776 case ISD::SETOLT: // flipped 03777 case ISD::SETOGT: 03778 case ISD::SETGT: return X86::COND_A; 03779 case ISD::SETOLE: // flipped 03780 case ISD::SETOGE: 03781 case ISD::SETGE: return X86::COND_AE; 03782 case ISD::SETUGT: // flipped 03783 case ISD::SETULT: 03784 case ISD::SETLT: return X86::COND_B; 03785 case ISD::SETUGE: // flipped 03786 case ISD::SETULE: 03787 case ISD::SETLE: return X86::COND_BE; 03788 case ISD::SETONE: 03789 case ISD::SETNE: return X86::COND_NE; 03790 case ISD::SETUO: return X86::COND_P; 03791 case ISD::SETO: return X86::COND_NP; 03792 case ISD::SETOEQ: 03793 case ISD::SETUNE: return X86::COND_INVALID; 03794 } 03795 } 03796 03797 /// hasFPCMov - is there a floating point cmov for the specific X86 condition 03798 /// code. Current x86 isa includes the following FP cmov instructions: 03799 /// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu. 03800 static bool hasFPCMov(unsigned X86CC) { 03801 switch (X86CC) { 03802 default: 03803 return false; 03804 case X86::COND_B: 03805 case X86::COND_BE: 03806 case X86::COND_E: 03807 case X86::COND_P: 03808 case X86::COND_A: 03809 case X86::COND_AE: 03810 case X86::COND_NE: 03811 case X86::COND_NP: 03812 return true; 03813 } 03814 } 03815 03816 /// isFPImmLegal - Returns true if the target can instruction select the 03817 /// specified FP immediate natively. If false, the legalizer will 03818 /// materialize the FP immediate as a load from a constant pool. 03819 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { 03820 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) { 03821 if (Imm.bitwiseIsEqual(LegalFPImmediates[i])) 03822 return true; 03823 } 03824 return false; 03825 } 03826 03827 /// \brief Returns true if it is beneficial to convert a load of a constant 03828 /// to just the constant itself. 03829 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, 03830 Type *Ty) const { 03831 assert(Ty->isIntegerTy()); 03832 03833 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 03834 if (BitSize == 0 || BitSize > 64) 03835 return false; 03836 return true; 03837 } 03838 03839 /// isUndefOrInRange - Return true if Val is undef or if its value falls within 03840 /// the specified range (L, H]. 03841 static bool isUndefOrInRange(int Val, int Low, int Hi) { 03842 return (Val < 0) || (Val >= Low && Val < Hi); 03843 } 03844 03845 /// isUndefOrEqual - Val is either less than zero (undef) or equal to the 03846 /// specified value. 
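// Editorial illustration (not part of the original source): shuffle mask
// entries use a negative value (-1) as the "undef / don't care" sentinel, so
//   isUndefOrEqual(-1, 3) == true   // undef matches any position
//   isUndefOrEqual( 3, 3) == true
//   isUndefOrEqual( 2, 3) == false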
03847 static bool isUndefOrEqual(int Val, int CmpVal) { 03848 return (Val < 0 || Val == CmpVal); 03849 } 03850 03851 /// isSequentialOrUndefInRange - Return true if every element in Mask, beginning 03852 /// from position Pos and ending in Pos+Size, falls within the specified 03853 /// sequential range (L, L+Pos]. or is undef. 03854 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, 03855 unsigned Pos, unsigned Size, int Low) { 03856 for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low) 03857 if (!isUndefOrEqual(Mask[i], Low)) 03858 return false; 03859 return true; 03860 } 03861 03862 /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that 03863 /// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference 03864 /// the second operand. 03865 static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT) { 03866 if (VT == MVT::v4f32 || VT == MVT::v4i32 ) 03867 return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4); 03868 if (VT == MVT::v2f64 || VT == MVT::v2i64) 03869 return (Mask[0] < 2 && Mask[1] < 2); 03870 return false; 03871 } 03872 03873 /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that 03874 /// is suitable for input to PSHUFHW. 03875 static bool isPSHUFHWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) { 03876 if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16)) 03877 return false; 03878 03879 // Lower quadword copied in order or undef. 03880 if (!isSequentialOrUndefInRange(Mask, 0, 4, 0)) 03881 return false; 03882 03883 // Upper quadword shuffled. 03884 for (unsigned i = 4; i != 8; ++i) 03885 if (!isUndefOrInRange(Mask[i], 4, 8)) 03886 return false; 03887 03888 if (VT == MVT::v16i16) { 03889 // Lower quadword copied in order or undef. 03890 if (!isSequentialOrUndefInRange(Mask, 8, 4, 8)) 03891 return false; 03892 03893 // Upper quadword shuffled. 03894 for (unsigned i = 12; i != 16; ++i) 03895 if (!isUndefOrInRange(Mask[i], 12, 16)) 03896 return false; 03897 } 03898 03899 return true; 03900 } 03901 03902 /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that 03903 /// is suitable for input to PSHUFLW. 03904 static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) { 03905 if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16)) 03906 return false; 03907 03908 // Upper quadword copied in order. 03909 if (!isSequentialOrUndefInRange(Mask, 4, 4, 4)) 03910 return false; 03911 03912 // Lower quadword shuffled. 03913 for (unsigned i = 0; i != 4; ++i) 03914 if (!isUndefOrInRange(Mask[i], 0, 4)) 03915 return false; 03916 03917 if (VT == MVT::v16i16) { 03918 // Upper quadword copied in order. 03919 if (!isSequentialOrUndefInRange(Mask, 12, 4, 12)) 03920 return false; 03921 03922 // Lower quadword shuffled. 03923 for (unsigned i = 8; i != 12; ++i) 03924 if (!isUndefOrInRange(Mask[i], 8, 12)) 03925 return false; 03926 } 03927 03928 return true; 03929 } 03930 03931 /// \brief Return true if the mask specifies a shuffle of elements that is 03932 /// suitable for input to intralane (palignr) or interlane (valign) vector 03933 /// right-shift. 03934 static bool isAlignrMask(ArrayRef<int> Mask, MVT VT, bool InterLane) { 03935 unsigned NumElts = VT.getVectorNumElements(); 03936 unsigned NumLanes = InterLane ? 1: VT.getSizeInBits()/128; 03937 unsigned NumLaneElts = NumElts/NumLanes; 03938 03939 // Do not handle 64-bit element shuffles with palignr. 
03940 if (NumLaneElts == 2) 03941 return false; 03942 03943 for (unsigned l = 0; l != NumElts; l+=NumLaneElts) { 03944 unsigned i; 03945 for (i = 0; i != NumLaneElts; ++i) { 03946 if (Mask[i+l] >= 0) 03947 break; 03948 } 03949 03950 // Lane is all undef, go to next lane 03951 if (i == NumLaneElts) 03952 continue; 03953 03954 int Start = Mask[i+l]; 03955 03956 // Make sure its in this lane in one of the sources 03957 if (!isUndefOrInRange(Start, l, l+NumLaneElts) && 03958 !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts)) 03959 return false; 03960 03961 // If not lane 0, then we must match lane 0 03962 if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l)) 03963 return false; 03964 03965 // Correct second source to be contiguous with first source 03966 if (Start >= (int)NumElts) 03967 Start -= NumElts - NumLaneElts; 03968 03969 // Make sure we're shifting in the right direction. 03970 if (Start <= (int)(i+l)) 03971 return false; 03972 03973 Start -= i; 03974 03975 // Check the rest of the elements to see if they are consecutive. 03976 for (++i; i != NumLaneElts; ++i) { 03977 int Idx = Mask[i+l]; 03978 03979 // Make sure its in this lane 03980 if (!isUndefOrInRange(Idx, l, l+NumLaneElts) && 03981 !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts)) 03982 return false; 03983 03984 // If not lane 0, then we must match lane 0 03985 if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l)) 03986 return false; 03987 03988 if (Idx >= (int)NumElts) 03989 Idx -= NumElts - NumLaneElts; 03990 03991 if (!isUndefOrEqual(Idx, Start+i)) 03992 return false; 03993 03994 } 03995 } 03996 03997 return true; 03998 } 03999 04000 /// \brief Return true if the node specifies a shuffle of elements that is 04001 /// suitable for input to PALIGNR. 04002 static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT, 04003 const X86Subtarget *Subtarget) { 04004 if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) || 04005 (VT.is256BitVector() && !Subtarget->hasInt256()) || 04006 VT.is512BitVector()) 04007 // FIXME: Add AVX512BW. 04008 return false; 04009 04010 return isAlignrMask(Mask, VT, false); 04011 } 04012 04013 /// \brief Return true if the node specifies a shuffle of elements that is 04014 /// suitable for input to VALIGN. 04015 static bool isVALIGNMask(ArrayRef<int> Mask, MVT VT, 04016 const X86Subtarget *Subtarget) { 04017 // FIXME: Add AVX512VL. 04018 if (!VT.is512BitVector() || !Subtarget->hasAVX512()) 04019 return false; 04020 return isAlignrMask(Mask, VT, true); 04021 } 04022 04023 /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming 04024 /// the two vector operands have swapped position. 04025 static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, 04026 unsigned NumElems) { 04027 for (unsigned i = 0; i != NumElems; ++i) { 04028 int idx = Mask[i]; 04029 if (idx < 0) 04030 continue; 04031 else if (idx < (int)NumElems) 04032 Mask[i] = idx + NumElems; 04033 else 04034 Mask[i] = idx - NumElems; 04035 } 04036 } 04037 04038 /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand 04039 /// specifies a shuffle of elements that is suitable for input to 128/256-bit 04040 /// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be 04041 /// reverse of what x86 shuffles want. 
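// Editorial illustration (not part of the original source): for a v4f32
// SHUFPS the first two result elements must come from the first source and
// the last two from the second, so the mask <2, 3, 4, 7> (elements 2 and 3 of
// V1, elements 0 and 3 of V2) is accepted with Commuted == false, while
// <4, 5, 0, 1> is only accepted with Commuted == true.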
04042 static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) {
04043 
04044   unsigned NumElems = VT.getVectorNumElements();
04045   unsigned NumLanes = VT.getSizeInBits()/128;
04046   unsigned NumLaneElems = NumElems/NumLanes;
04047 
04048   if (NumLaneElems != 2 && NumLaneElems != 4)
04049     return false;
04050 
04051   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
04052   bool symetricMaskRequired =
04053     (VT.getSizeInBits() >= 256) && (EltSize == 32);
04054 
04055   // VSHUFPSY divides the resulting vector into 4 chunks.
04056   // The sources are also split into 4 chunks, and each destination
04057   // chunk must come from a different source chunk.
04058   //
04059   //  SRC1 =>  X7  X6  X5  X4  X3  X2  X1  X0
04060   //  SRC2 =>  Y7  Y6  Y5  Y4  Y3  Y2  Y1  Y0
04061   //
04062   //  DST  =>  Y7..Y4, Y7..Y4, X7..X4, X7..X4,
04063   //           Y3..Y0, Y3..Y0, X3..X0, X3..X0
04064   //
04065   // VSHUFPDY divides the resulting vector into 4 chunks.
04066   // The sources are also split into 4 chunks, and each destination
04067   // chunk must come from a different source chunk.
04068   //
04069   //  SRC1 =>  X3  X2  X1  X0
04070   //  SRC2 =>  Y3  Y2  Y1  Y0
04071   //
04072   //  DST  =>  Y3..Y2, X3..X2, Y1..Y0, X1..X0
04073   //
04074   SmallVector<int, 4> MaskVal(NumLaneElems, -1);
04075   unsigned HalfLaneElems = NumLaneElems/2;
04076   for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
04077     for (unsigned i = 0; i != NumLaneElems; ++i) {
04078       int Idx = Mask[i+l];
04079       unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
04080       if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
04081         return false;
04082       // For VSHUFPSY, the mask of the second half must be the same as the
04083       // first but with the appropriate offsets. This works in the same way as
04084       // VPERMILPS works with masks.
04085       if (!symetricMaskRequired || Idx < 0)
04086         continue;
04087       if (MaskVal[i] < 0) {
04088         MaskVal[i] = Idx - l;
04089         continue;
04090       }
04091       if ((signed)(Idx - l) != MaskVal[i])
04092         return false;
04093     }
04094   }
04095 
04096   return true;
04097 }
04098 
04099 /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
04100 /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
04101 static bool isMOVHLPSMask(ArrayRef<int> Mask, MVT VT) {
04102   if (!VT.is128BitVector())
04103     return false;
04104 
04105   unsigned NumElems = VT.getVectorNumElements();
04106 
04107   if (NumElems != 4)
04108     return false;
04109 
04110   // Expect Mask[0] == 6, Mask[1] == 7, Mask[2] == 2, Mask[3] == 3.
04111   return isUndefOrEqual(Mask[0], 6) &&
04112          isUndefOrEqual(Mask[1], 7) &&
04113          isUndefOrEqual(Mask[2], 2) &&
04114          isUndefOrEqual(Mask[3], 3);
04115 }
04116 
04117 /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
04118 /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
04119 /// <2, 3, 2, 3>
04120 static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, MVT VT) {
04121   if (!VT.is128BitVector())
04122     return false;
04123 
04124   unsigned NumElems = VT.getVectorNumElements();
04125 
04126   if (NumElems != 4)
04127     return false;
04128 
04129   return isUndefOrEqual(Mask[0], 2) &&
04130          isUndefOrEqual(Mask[1], 3) &&
04131          isUndefOrEqual(Mask[2], 2) &&
04132          isUndefOrEqual(Mask[3], 3);
04133 }
04134 
04135 /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
04136 /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
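// Editorial illustration (not part of the original source): for v4f32 the
// mask <4, 5, 2, 3> matches MOVLPS - the low two result elements come from
// the second operand and the high two are kept from the first operand.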
04137 static bool isMOVLPMask(ArrayRef<int> Mask, MVT VT) { 04138 if (!VT.is128BitVector()) 04139 return false; 04140 04141 unsigned NumElems = VT.getVectorNumElements(); 04142 04143 if (NumElems != 2 && NumElems != 4) 04144 return false; 04145 04146 for (unsigned i = 0, e = NumElems/2; i != e; ++i) 04147 if (!isUndefOrEqual(Mask[i], i + NumElems)) 04148 return false; 04149 04150 for (unsigned i = NumElems/2, e = NumElems; i != e; ++i) 04151 if (!isUndefOrEqual(Mask[i], i)) 04152 return false; 04153 04154 return true; 04155 } 04156 04157 /// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand 04158 /// specifies a shuffle of elements that is suitable for input to MOVLHPS. 04159 static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) { 04160 if (!VT.is128BitVector()) 04161 return false; 04162 04163 unsigned NumElems = VT.getVectorNumElements(); 04164 04165 if (NumElems != 2 && NumElems != 4) 04166 return false; 04167 04168 for (unsigned i = 0, e = NumElems/2; i != e; ++i) 04169 if (!isUndefOrEqual(Mask[i], i)) 04170 return false; 04171 04172 for (unsigned i = 0, e = NumElems/2; i != e; ++i) 04173 if (!isUndefOrEqual(Mask[i + e], i + NumElems)) 04174 return false; 04175 04176 return true; 04177 } 04178 04179 /// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand 04180 /// specifies a shuffle of elements that is suitable for input to INSERTPS. 04181 /// i. e: If all but one element come from the same vector. 04182 static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) { 04183 // TODO: Deal with AVX's VINSERTPS 04184 if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32)) 04185 return false; 04186 04187 unsigned CorrectPosV1 = 0; 04188 unsigned CorrectPosV2 = 0; 04189 for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) { 04190 if (Mask[i] == -1) { 04191 ++CorrectPosV1; 04192 ++CorrectPosV2; 04193 continue; 04194 } 04195 04196 if (Mask[i] == i) 04197 ++CorrectPosV1; 04198 else if (Mask[i] == i + 4) 04199 ++CorrectPosV2; 04200 } 04201 04202 if (CorrectPosV1 == 3 || CorrectPosV2 == 3) 04203 // We have 3 elements (undefs count as elements from any vector) from one 04204 // vector, and one from another. 04205 return true; 04206 04207 return false; 04208 } 04209 04210 // 04211 // Some special combinations that can be optimized. 04212 // 04213 static 04214 SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp, 04215 SelectionDAG &DAG) { 04216 MVT VT = SVOp->getSimpleValueType(0); 04217 SDLoc dl(SVOp); 04218 04219 if (VT != MVT::v8i32 && VT != MVT::v8f32) 04220 return SDValue(); 04221 04222 ArrayRef<int> Mask = SVOp->getMask(); 04223 04224 // These are the special masks that may be optimized. 04225 static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14}; 04226 static const int MaskToOptimizeOdd[] = {1, 9, 3, 11, 5, 13, 7, 15}; 04227 bool MatchEvenMask = true; 04228 bool MatchOddMask = true; 04229 for (int i=0; i<8; ++i) { 04230 if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i])) 04231 MatchEvenMask = false; 04232 if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i])) 04233 MatchOddMask = false; 04234 } 04235 04236 if (!MatchEvenMask && !MatchOddMask) 04237 return SDValue(); 04238 04239 SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT); 04240 04241 SDValue Op0 = SVOp->getOperand(0); 04242 SDValue Op1 = SVOp->getOperand(1); 04243 04244 if (MatchEvenMask) { 04245 // Shift the second operand right to 32 bits. 
04246 static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 }; 04247 Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask); 04248 } else { 04249 // Shift the first operand left to 32 bits. 04250 static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 }; 04251 Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask); 04252 } 04253 static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15}; 04254 return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask); 04255 } 04256 04257 /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand 04258 /// specifies a shuffle of elements that is suitable for input to UNPCKL. 04259 static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT, 04260 bool HasInt256, bool V2IsSplat = false) { 04261 04262 assert(VT.getSizeInBits() >= 128 && 04263 "Unsupported vector type for unpckl"); 04264 04265 unsigned NumElts = VT.getVectorNumElements(); 04266 if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 && 04267 (!HasInt256 || (NumElts != 16 && NumElts != 32))) 04268 return false; 04269 04270 assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) && 04271 "Unsupported vector type for unpckh"); 04272 04273 // AVX defines UNPCK* to operate independently on 128-bit lanes. 04274 unsigned NumLanes = VT.getSizeInBits()/128; 04275 unsigned NumLaneElts = NumElts/NumLanes; 04276 04277 for (unsigned l = 0; l != NumElts; l += NumLaneElts) { 04278 for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) { 04279 int BitI = Mask[l+i]; 04280 int BitI1 = Mask[l+i+1]; 04281 if (!isUndefOrEqual(BitI, j)) 04282 return false; 04283 if (V2IsSplat) { 04284 if (!isUndefOrEqual(BitI1, NumElts)) 04285 return false; 04286 } else { 04287 if (!isUndefOrEqual(BitI1, j + NumElts)) 04288 return false; 04289 } 04290 } 04291 } 04292 04293 return true; 04294 } 04295 04296 /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand 04297 /// specifies a shuffle of elements that is suitable for input to UNPCKH. 04298 static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT, 04299 bool HasInt256, bool V2IsSplat = false) { 04300 assert(VT.getSizeInBits() >= 128 && 04301 "Unsupported vector type for unpckh"); 04302 04303 unsigned NumElts = VT.getVectorNumElements(); 04304 if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 && 04305 (!HasInt256 || (NumElts != 16 && NumElts != 32))) 04306 return false; 04307 04308 assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) && 04309 "Unsupported vector type for unpckh"); 04310 04311 // AVX defines UNPCK* to operate independently on 128-bit lanes. 04312 unsigned NumLanes = VT.getSizeInBits()/128; 04313 unsigned NumLaneElts = NumElts/NumLanes; 04314 04315 for (unsigned l = 0; l != NumElts; l += NumLaneElts) { 04316 for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) { 04317 int BitI = Mask[l+i]; 04318 int BitI1 = Mask[l+i+1]; 04319 if (!isUndefOrEqual(BitI, j)) 04320 return false; 04321 if (V2IsSplat) { 04322 if (isUndefOrEqual(BitI1, NumElts)) 04323 return false; 04324 } else { 04325 if (!isUndefOrEqual(BitI1, j+NumElts)) 04326 return false; 04327 } 04328 } 04329 } 04330 return true; 04331 } 04332 04333 /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form 04334 /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. 
vector_shuffle v, undef, 04335 /// <0, 0, 1, 1> 04336 static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) { 04337 unsigned NumElts = VT.getVectorNumElements(); 04338 bool Is256BitVec = VT.is256BitVector(); 04339 04340 if (VT.is512BitVector()) 04341 return false; 04342 assert((VT.is128BitVector() || VT.is256BitVector()) && 04343 "Unsupported vector type for unpckh"); 04344 04345 if (Is256BitVec && NumElts != 4 && NumElts != 8 && 04346 (!HasInt256 || (NumElts != 16 && NumElts != 32))) 04347 return false; 04348 04349 // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern 04350 // FIXME: Need a better way to get rid of this, there's no latency difference 04351 // between UNPCKLPD and MOVDDUP, the later should always be checked first and 04352 // the former later. We should also remove the "_undef" special mask. 04353 if (NumElts == 4 && Is256BitVec) 04354 return false; 04355 04356 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate 04357 // independently on 128-bit lanes. 04358 unsigned NumLanes = VT.getSizeInBits()/128; 04359 unsigned NumLaneElts = NumElts/NumLanes; 04360 04361 for (unsigned l = 0; l != NumElts; l += NumLaneElts) { 04362 for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) { 04363 int BitI = Mask[l+i]; 04364 int BitI1 = Mask[l+i+1]; 04365 04366 if (!isUndefOrEqual(BitI, j)) 04367 return false; 04368 if (!isUndefOrEqual(BitI1, j)) 04369 return false; 04370 } 04371 } 04372 04373 return true; 04374 } 04375 04376 /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form 04377 /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef, 04378 /// <2, 2, 3, 3> 04379 static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) { 04380 unsigned NumElts = VT.getVectorNumElements(); 04381 04382 if (VT.is512BitVector()) 04383 return false; 04384 04385 assert((VT.is128BitVector() || VT.is256BitVector()) && 04386 "Unsupported vector type for unpckh"); 04387 04388 if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 && 04389 (!HasInt256 || (NumElts != 16 && NumElts != 32))) 04390 return false; 04391 04392 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate 04393 // independently on 128-bit lanes. 
04394 unsigned NumLanes = VT.getSizeInBits()/128; 04395 unsigned NumLaneElts = NumElts/NumLanes; 04396 04397 for (unsigned l = 0; l != NumElts; l += NumLaneElts) { 04398 for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) { 04399 int BitI = Mask[l+i]; 04400 int BitI1 = Mask[l+i+1]; 04401 if (!isUndefOrEqual(BitI, j)) 04402 return false; 04403 if (!isUndefOrEqual(BitI1, j)) 04404 return false; 04405 } 04406 } 04407 return true; 04408 } 04409 04410 // Match for INSERTI64x4 INSERTF64x4 instructions (src0[0], src1[0]) or 04411 // (src1[0], src0[1]), manipulation with 256-bit sub-vectors 04412 static bool isINSERT64x4Mask(ArrayRef<int> Mask, MVT VT, unsigned int *Imm) { 04413 if (!VT.is512BitVector()) 04414 return false; 04415 04416 unsigned NumElts = VT.getVectorNumElements(); 04417 unsigned HalfSize = NumElts/2; 04418 if (isSequentialOrUndefInRange(Mask, 0, HalfSize, 0)) { 04419 if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, NumElts)) { 04420 *Imm = 1; 04421 return true; 04422 } 04423 } 04424 if (isSequentialOrUndefInRange(Mask, 0, HalfSize, NumElts)) { 04425 if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, HalfSize)) { 04426 *Imm = 0; 04427 return true; 04428 } 04429 } 04430 return false; 04431 } 04432 04433 /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand 04434 /// specifies a shuffle of elements that is suitable for input to MOVSS, 04435 /// MOVSD, and MOVD, i.e. setting the lowest element. 04436 static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) { 04437 if (VT.getVectorElementType().getSizeInBits() < 32) 04438 return false; 04439 if (!VT.is128BitVector()) 04440 return false; 04441 04442 unsigned NumElts = VT.getVectorNumElements(); 04443 04444 if (!isUndefOrEqual(Mask[0], NumElts)) 04445 return false; 04446 04447 for (unsigned i = 1; i != NumElts; ++i) 04448 if (!isUndefOrEqual(Mask[i], i)) 04449 return false; 04450 04451 return true; 04452 } 04453 04454 /// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered 04455 /// as permutations between 128-bit chunks or halves. As an example: this 04456 /// shuffle bellow: 04457 /// vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15> 04458 /// The first half comes from the second half of V1 and the second half from the 04459 /// the second half of V2. 04460 static bool isVPERM2X128Mask(ArrayRef<int> Mask, MVT VT, bool HasFp256) { 04461 if (!HasFp256 || !VT.is256BitVector()) 04462 return false; 04463 04464 // The shuffle result is divided into half A and half B. In total the two 04465 // sources have 4 halves, namely: C, D, E, F. The final values of A and 04466 // B must come from C, D, E or F. 04467 unsigned HalfSize = VT.getVectorNumElements()/2; 04468 bool MatchA = false, MatchB = false; 04469 04470 // Check if A comes from one of C, D, E, F. 04471 for (unsigned Half = 0; Half != 4; ++Half) { 04472 if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) { 04473 MatchA = true; 04474 break; 04475 } 04476 } 04477 04478 // Check if B comes from one of C, D, E, F. 04479 for (unsigned Half = 0; Half != 4; ++Half) { 04480 if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) { 04481 MatchB = true; 04482 break; 04483 } 04484 } 04485 04486 return MatchA && MatchB; 04487 } 04488 04489 /// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle 04490 /// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions. 
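// [Editor's illustrative sketch -- not part of the original X86ISelLowering.cpp;
//  the demo helper below is hypothetical and only exercises isVPERM2X128Mask,
//  defined above.]  A worked VPERM2X128 example: for a v8f32 shuffle mask
// <4, 5, 6, 7, 12, 13, 14, 15> the result's low half is half index 4/4 == 1
// (upper half of V1) and its high half is half index 12/4 == 3 (upper half of
// V2), so getShuffleVPERM2X128Immediate below yields 1 | (3 << 4) == 0x31.
static inline bool demoVPERM2X128MaskIsMatched(bool HasFp256) {
  static const int DemoMask[] = { 4, 5, 6, 7, 12, 13, 14, 15 };
  // Only matched when 256-bit FP shuffles (AVX) are available.
  return isVPERM2X128Mask(makeArrayRef(DemoMask), MVT::v8f32, HasFp256);
}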
04491 static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) { 04492 MVT VT = SVOp->getSimpleValueType(0); 04493 04494 unsigned HalfSize = VT.getVectorNumElements()/2; 04495 04496 unsigned FstHalf = 0, SndHalf = 0; 04497 for (unsigned i = 0; i < HalfSize; ++i) { 04498 if (SVOp->getMaskElt(i) > 0) { 04499 FstHalf = SVOp->getMaskElt(i)/HalfSize; 04500 break; 04501 } 04502 } 04503 for (unsigned i = HalfSize; i < HalfSize*2; ++i) { 04504 if (SVOp->getMaskElt(i) > 0) { 04505 SndHalf = SVOp->getMaskElt(i)/HalfSize; 04506 break; 04507 } 04508 } 04509 04510 return (FstHalf | (SndHalf << 4)); 04511 } 04512 04513 // Symetric in-lane mask. Each lane has 4 elements (for imm8) 04514 static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) { 04515 unsigned EltSize = VT.getVectorElementType().getSizeInBits(); 04516 if (EltSize < 32) 04517 return false; 04518 04519 unsigned NumElts = VT.getVectorNumElements(); 04520 Imm8 = 0; 04521 if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) { 04522 for (unsigned i = 0; i != NumElts; ++i) { 04523 if (Mask[i] < 0) 04524 continue; 04525 Imm8 |= Mask[i] << (i*2); 04526 } 04527 return true; 04528 } 04529 04530 unsigned LaneSize = 4; 04531 SmallVector<int, 4> MaskVal(LaneSize, -1); 04532 04533 for (unsigned l = 0; l != NumElts; l += LaneSize) { 04534 for (unsigned i = 0; i != LaneSize; ++i) { 04535 if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize)) 04536 return false; 04537 if (Mask[i+l] < 0) 04538 continue; 04539 if (MaskVal[i] < 0) { 04540 MaskVal[i] = Mask[i+l] - l; 04541 Imm8 |= MaskVal[i] << (i*2); 04542 continue; 04543 } 04544 if (Mask[i+l] != (signed)(MaskVal[i]+l)) 04545 return false; 04546 } 04547 } 04548 return true; 04549 } 04550 04551 /// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand 04552 /// specifies a shuffle of elements that is suitable for input to VPERMILPD*. 04553 /// Note that VPERMIL mask matching is different depending whether theunderlying 04554 /// type is 32 or 64. In the VPERMILPS the high half of the mask should point 04555 /// to the same elements of the low, but to the higher half of the source. 04556 /// In VPERMILPD the two lanes could be shuffled independently of each other 04557 /// with the same restriction that lanes can't be crossed. Also handles PSHUFDY. 04558 static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) { 04559 unsigned EltSize = VT.getVectorElementType().getSizeInBits(); 04560 if (VT.getSizeInBits() < 256 || EltSize < 32) 04561 return false; 04562 bool symetricMaskRequired = (EltSize == 32); 04563 unsigned NumElts = VT.getVectorNumElements(); 04564 04565 unsigned NumLanes = VT.getSizeInBits()/128; 04566 unsigned LaneSize = NumElts/NumLanes; 04567 // 2 or 4 elements in one lane 04568 04569 SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1); 04570 for (unsigned l = 0; l != NumElts; l += LaneSize) { 04571 for (unsigned i = 0; i != LaneSize; ++i) { 04572 if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize)) 04573 return false; 04574 if (symetricMaskRequired) { 04575 if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) { 04576 ExpectedMaskVal[i] = Mask[i+l] - l; 04577 continue; 04578 } 04579 if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l)) 04580 return false; 04581 } 04582 } 04583 } 04584 return true; 04585 } 04586 04587 /// isCommutedMOVLMask - Returns true if the shuffle mask is except the reverse 04588 /// of what x86 movss want. X86 movs requires the lowest element to be lowest 04589 /// element of vector 2 and the other elements to come from vector 1 in order. 
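// [Editor's illustrative sketch -- not part of the original source; the demo
//  helper below is hypothetical and only calls isVPERMILPMask, defined above.]
// For 32-bit elements isVPERMILPMask requires every 128-bit lane to repeat the
// same in-lane pattern, because a single imm8 drives all lanes of VPERMILPS:
static inline bool demoVPERMILPSMaskCheck() {
  static const int PerLaneRepeat[] = { 0, 0, 3, 2, 4, 4, 7, 6 }; // lane 1 mirrors lane 0
  static const int LanesDisagree[] = { 0, 1, 2, 3, 5, 4, 6, 7 }; // lanes differ -> rejected
  return isVPERMILPMask(makeArrayRef(PerLaneRepeat), MVT::v8f32) &&
         !isVPERMILPMask(makeArrayRef(LanesDisagree), MVT::v8f32);
}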
04590 static bool isCommutedMOVLMask(ArrayRef<int> Mask, MVT VT, 04591 bool V2IsSplat = false, bool V2IsUndef = false) { 04592 if (!VT.is128BitVector()) 04593 return false; 04594 04595 unsigned NumOps = VT.getVectorNumElements(); 04596 if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16) 04597 return false; 04598 04599 if (!isUndefOrEqual(Mask[0], 0)) 04600 return false; 04601 04602 for (unsigned i = 1; i != NumOps; ++i) 04603 if (!(isUndefOrEqual(Mask[i], i+NumOps) || 04604 (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) || 04605 (V2IsSplat && isUndefOrEqual(Mask[i], NumOps)))) 04606 return false; 04607 04608 return true; 04609 } 04610 04611 /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand 04612 /// specifies a shuffle of elements that is suitable for input to MOVSHDUP. 04613 /// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7> 04614 static bool isMOVSHDUPMask(ArrayRef<int> Mask, MVT VT, 04615 const X86Subtarget *Subtarget) { 04616 if (!Subtarget->hasSSE3()) 04617 return false; 04618 04619 unsigned NumElems = VT.getVectorNumElements(); 04620 04621 if ((VT.is128BitVector() && NumElems != 4) || 04622 (VT.is256BitVector() && NumElems != 8) || 04623 (VT.is512BitVector() && NumElems != 16)) 04624 return false; 04625 04626 // "i+1" is the value the indexed mask element must have 04627 for (unsigned i = 0; i != NumElems; i += 2) 04628 if (!isUndefOrEqual(Mask[i], i+1) || 04629 !isUndefOrEqual(Mask[i+1], i+1)) 04630 return false; 04631 04632 return true; 04633 } 04634 04635 /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand 04636 /// specifies a shuffle of elements that is suitable for input to MOVSLDUP. 04637 /// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6> 04638 static bool isMOVSLDUPMask(ArrayRef<int> Mask, MVT VT, 04639 const X86Subtarget *Subtarget) { 04640 if (!Subtarget->hasSSE3()) 04641 return false; 04642 04643 unsigned NumElems = VT.getVectorNumElements(); 04644 04645 if ((VT.is128BitVector() && NumElems != 4) || 04646 (VT.is256BitVector() && NumElems != 8) || 04647 (VT.is512BitVector() && NumElems != 16)) 04648 return false; 04649 04650 // "i" is the value the indexed mask element must have 04651 for (unsigned i = 0; i != NumElems; i += 2) 04652 if (!isUndefOrEqual(Mask[i], i) || 04653 !isUndefOrEqual(Mask[i+1], i)) 04654 return false; 04655 04656 return true; 04657 } 04658 04659 /// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand 04660 /// specifies a shuffle of elements that is suitable for input to 256-bit 04661 /// version of MOVDDUP. 04662 static bool isMOVDDUPYMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) { 04663 if (!HasFp256 || !VT.is256BitVector()) 04664 return false; 04665 04666 unsigned NumElts = VT.getVectorNumElements(); 04667 if (NumElts != 4) 04668 return false; 04669 04670 for (unsigned i = 0; i != NumElts/2; ++i) 04671 if (!isUndefOrEqual(Mask[i], 0)) 04672 return false; 04673 for (unsigned i = NumElts/2; i != NumElts; ++i) 04674 if (!isUndefOrEqual(Mask[i], NumElts/2)) 04675 return false; 04676 return true; 04677 } 04678 04679 /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand 04680 /// specifies a shuffle of elements that is suitable for input to 128-bit 04681 /// version of MOVDDUP. 
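// [Editor's illustrative sketch -- not part of the original source; the demo
//  helper below is hypothetical.]  The canonical v4f32 masks accepted by the
// MOVSHDUP/MOVSLDUP predicates above; both return true only when the given
// subtarget reports SSE3:
static inline bool demoMOVSxDUPMasks(const X86Subtarget *Subtarget) {
  static const int SHDupMask[] = { 1, 1, 3, 3 }; // duplicate the odd (high) elements
  static const int SLDupMask[] = { 0, 0, 2, 2 }; // duplicate the even (low) elements
  return isMOVSHDUPMask(makeArrayRef(SHDupMask), MVT::v4f32, Subtarget) &&
         isMOVSLDUPMask(makeArrayRef(SLDupMask), MVT::v4f32, Subtarget);
}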
04682 static bool isMOVDDUPMask(ArrayRef<int> Mask, MVT VT) { 04683 if (!VT.is128BitVector()) 04684 return false; 04685 04686 unsigned e = VT.getVectorNumElements() / 2; 04687 for (unsigned i = 0; i != e; ++i) 04688 if (!isUndefOrEqual(Mask[i], i)) 04689 return false; 04690 for (unsigned i = 0; i != e; ++i) 04691 if (!isUndefOrEqual(Mask[e+i], i)) 04692 return false; 04693 return true; 04694 } 04695 04696 /// isVEXTRACTIndex - Return true if the specified 04697 /// EXTRACT_SUBVECTOR operand specifies a vector extract that is 04698 /// suitable for instruction that extract 128 or 256 bit vectors 04699 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) { 04700 assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width"); 04701 if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) 04702 return false; 04703 04704 // The index should be aligned on a vecWidth-bit boundary. 04705 uint64_t Index = 04706 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); 04707 04708 MVT VT = N->getSimpleValueType(0); 04709 unsigned ElSize = VT.getVectorElementType().getSizeInBits(); 04710 bool Result = (Index * ElSize) % vecWidth == 0; 04711 04712 return Result; 04713 } 04714 04715 /// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR 04716 /// operand specifies a subvector insert that is suitable for input to 04717 /// insertion of 128 or 256-bit subvectors 04718 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) { 04719 assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width"); 04720 if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) 04721 return false; 04722 // The index should be aligned on a vecWidth-bit boundary. 04723 uint64_t Index = 04724 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); 04725 04726 MVT VT = N->getSimpleValueType(0); 04727 unsigned ElSize = VT.getVectorElementType().getSizeInBits(); 04728 bool Result = (Index * ElSize) % vecWidth == 0; 04729 04730 return Result; 04731 } 04732 04733 bool X86::isVINSERT128Index(SDNode *N) { 04734 return isVINSERTIndex(N, 128); 04735 } 04736 04737 bool X86::isVINSERT256Index(SDNode *N) { 04738 return isVINSERTIndex(N, 256); 04739 } 04740 04741 bool X86::isVEXTRACT128Index(SDNode *N) { 04742 return isVEXTRACTIndex(N, 128); 04743 } 04744 04745 bool X86::isVEXTRACT256Index(SDNode *N) { 04746 return isVEXTRACTIndex(N, 256); 04747 } 04748 04749 /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle 04750 /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. 04751 /// Handles 128-bit and 256-bit. 04752 static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) { 04753 MVT VT = N->getSimpleValueType(0); 04754 04755 assert((VT.getSizeInBits() >= 128) && 04756 "Unsupported vector type for PSHUF/SHUFP"); 04757 04758 // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate 04759 // independently on 128-bit lanes. 04760 unsigned NumElts = VT.getVectorNumElements(); 04761 unsigned NumLanes = VT.getSizeInBits()/128; 04762 unsigned NumLaneElts = NumElts/NumLanes; 04763 04764 assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) && 04765 "Only supports 2, 4 or 8 elements per lane"); 04766 04767 unsigned Shift = (NumLaneElts >= 4) ? 
1 : 0; 04768 unsigned Mask = 0; 04769 for (unsigned i = 0; i != NumElts; ++i) { 04770 int Elt = N->getMaskElt(i); 04771 if (Elt < 0) continue; 04772 Elt &= NumLaneElts - 1; 04773 unsigned ShAmt = (i << Shift) % 8; 04774 Mask |= Elt << ShAmt; 04775 } 04776 04777 return Mask; 04778 } 04779 04780 /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle 04781 /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. 04782 static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) { 04783 MVT VT = N->getSimpleValueType(0); 04784 04785 assert((VT == MVT::v8i16 || VT == MVT::v16i16) && 04786 "Unsupported vector type for PSHUFHW"); 04787 04788 unsigned NumElts = VT.getVectorNumElements(); 04789 04790 unsigned Mask = 0; 04791 for (unsigned l = 0; l != NumElts; l += 8) { 04792 // 8 nodes per lane, but we only care about the last 4. 04793 for (unsigned i = 0; i < 4; ++i) { 04794 int Elt = N->getMaskElt(l+i+4); 04795 if (Elt < 0) continue; 04796 Elt &= 0x3; // only 2-bits. 04797 Mask |= Elt << (i * 2); 04798 } 04799 } 04800 04801 return Mask; 04802 } 04803 04804 /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle 04805 /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction. 04806 static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) { 04807 MVT VT = N->getSimpleValueType(0); 04808 04809 assert((VT == MVT::v8i16 || VT == MVT::v16i16) && 04810 "Unsupported vector type for PSHUFHW"); 04811 04812 unsigned NumElts = VT.getVectorNumElements(); 04813 04814 unsigned Mask = 0; 04815 for (unsigned l = 0; l != NumElts; l += 8) { 04816 // 8 nodes per lane, but we only care about the first 4. 04817 for (unsigned i = 0; i < 4; ++i) { 04818 int Elt = N->getMaskElt(l+i); 04819 if (Elt < 0) continue; 04820 Elt &= 0x3; // only 2-bits 04821 Mask |= Elt << (i * 2); 04822 } 04823 } 04824 04825 return Mask; 04826 } 04827 04828 /// \brief Return the appropriate immediate to shuffle the specified 04829 /// VECTOR_SHUFFLE mask with the PALIGNR (if InterLane is false) or with 04830 /// VALIGN (if Interlane is true) instructions. 04831 static unsigned getShuffleAlignrImmediate(ShuffleVectorSDNode *SVOp, 04832 bool InterLane) { 04833 MVT VT = SVOp->getSimpleValueType(0); 04834 unsigned EltSize = InterLane ? 1 : 04835 VT.getVectorElementType().getSizeInBits() >> 3; 04836 04837 unsigned NumElts = VT.getVectorNumElements(); 04838 unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128; 04839 unsigned NumLaneElts = NumElts/NumLanes; 04840 04841 int Val = 0; 04842 unsigned i; 04843 for (i = 0; i != NumElts; ++i) { 04844 Val = SVOp->getMaskElt(i); 04845 if (Val >= 0) 04846 break; 04847 } 04848 if (Val >= (int)NumElts) 04849 Val -= NumElts - NumLaneElts; 04850 04851 assert(Val - i > 0 && "PALIGNR imm should be positive"); 04852 return (Val - i) * EltSize; 04853 } 04854 04855 /// \brief Return the appropriate immediate to shuffle the specified 04856 /// VECTOR_SHUFFLE mask with the PALIGNR instruction. 04857 static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) { 04858 return getShuffleAlignrImmediate(SVOp, false); 04859 } 04860 04861 /// \brief Return the appropriate immediate to shuffle the specified 04862 /// VECTOR_SHUFFLE mask with the VALIGN instruction. 
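// [Editor's illustrative sketch -- not part of the original source; the demo
//  helper below is hypothetical.]  getShuffleSHUFImmediate above packs one
// 2-bit in-lane index per element, so the v4f32 mask <2, 1, 0, 3> becomes
// 2 | (1 << 2) | (0 << 4) | (3 << 6) == 0xC6, the familiar SHUFPS/PSHUFD
// control byte.  The same encoding re-computed over a plain mask array:
static inline unsigned demoPSHUFDImmediate() {
  static const int DemoMask[] = { 2, 1, 0, 3 };
  unsigned Imm = 0;
  for (unsigned i = 0; i != 4; ++i)
    Imm |= (DemoMask[i] & 0x3) << (i * 2); // 2 bits per element, element 0 lowest
  return Imm; // 0xC6
}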
04863 static unsigned getShuffleVALIGNImmediate(ShuffleVectorSDNode *SVOp) { 04864 return getShuffleAlignrImmediate(SVOp, true); 04865 } 04866 04867 04868 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) { 04869 assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width"); 04870 if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) 04871 llvm_unreachable("Illegal extract subvector for VEXTRACT"); 04872 04873 uint64_t Index = 04874 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); 04875 04876 MVT VecVT = N->getOperand(0).getSimpleValueType(); 04877 MVT ElVT = VecVT.getVectorElementType(); 04878 04879 unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits(); 04880 return Index / NumElemsPerChunk; 04881 } 04882 04883 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) { 04884 assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width"); 04885 if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) 04886 llvm_unreachable("Illegal insert subvector for VINSERT"); 04887 04888 uint64_t Index = 04889 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); 04890 04891 MVT VecVT = N->getSimpleValueType(0); 04892 MVT ElVT = VecVT.getVectorElementType(); 04893 04894 unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits(); 04895 return Index / NumElemsPerChunk; 04896 } 04897 04898 /// getExtractVEXTRACT128Immediate - Return the appropriate immediate 04899 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128 04900 /// and VINSERTI128 instructions. 04901 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) { 04902 return getExtractVEXTRACTImmediate(N, 128); 04903 } 04904 04905 /// getExtractVEXTRACT256Immediate - Return the appropriate immediate 04906 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4 04907 /// and VINSERTI64x4 instructions. 04908 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) { 04909 return getExtractVEXTRACTImmediate(N, 256); 04910 } 04911 04912 /// getInsertVINSERT128Immediate - Return the appropriate immediate 04913 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128 04914 /// and VINSERTI128 instructions. 04915 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) { 04916 return getInsertVINSERTImmediate(N, 128); 04917 } 04918 04919 /// getInsertVINSERT256Immediate - Return the appropriate immediate 04920 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF46x4 04921 /// and VINSERTI64x4 instructions. 04922 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) { 04923 return getInsertVINSERTImmediate(N, 256); 04924 } 04925 04926 /// isZero - Returns true if Elt is a constant integer zero 04927 static bool isZero(SDValue V) { 04928 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); 04929 return C && C->isNullValue(); 04930 } 04931 04932 /// isZeroNode - Returns true if Elt is a constant zero or a floating point 04933 /// constant +0.0. 04934 bool X86::isZeroNode(SDValue Elt) { 04935 if (isZero(Elt)) 04936 return true; 04937 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt)) 04938 return CFP->getValueAPF().isPosZero(); 04939 return false; 04940 } 04941 04942 /// ShouldXformToMOVHLPS - Return true if the node should be transformed to 04943 /// match movhlps. The lower half elements should come from upper half of 04944 /// V1 (and in order), and the upper half elements should come from the upper 04945 /// half of V2 (and in order). 
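// [Editor's illustrative sketch -- not part of the original source; the demo
//  helper below is hypothetical.]  getExtractVEXTRACTImmediate above turns an
// element index into a chunk index: extracting the subvector that starts at
// element 4 of a v8f32 selects 128-bit chunk 4 / (128/32) == 1, i.e.
// VEXTRACTF128 with immediate 1 (the upper half).
static inline unsigned demoVEXTRACTF128Immediate() {
  const unsigned EltBits = 32;                  // v8f32 element width
  const unsigned ExtractEltIdx = 4;             // EXTRACT_SUBVECTOR index
  const unsigned ElemsPerChunk = 128 / EltBits; // 4 elements per 128-bit chunk
  return ExtractEltIdx / ElemsPerChunk;         // == 1 -> upper half
}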
04946 static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) { 04947 if (!VT.is128BitVector()) 04948 return false; 04949 if (VT.getVectorNumElements() != 4) 04950 return false; 04951 for (unsigned i = 0, e = 2; i != e; ++i) 04952 if (!isUndefOrEqual(Mask[i], i+2)) 04953 return false; 04954 for (unsigned i = 2; i != 4; ++i) 04955 if (!isUndefOrEqual(Mask[i], i+4)) 04956 return false; 04957 return true; 04958 } 04959 04960 /// isScalarLoadToVector - Returns true if the node is a scalar load that 04961 /// is promoted to a vector. It also returns the LoadSDNode by reference if 04962 /// required. 04963 static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = nullptr) { 04964 if (N->getOpcode() != ISD::SCALAR_TO_VECTOR) 04965 return false; 04966 N = N->getOperand(0).getNode(); 04967 if (!ISD::isNON_EXTLoad(N)) 04968 return false; 04969 if (LD) 04970 *LD = cast<LoadSDNode>(N); 04971 return true; 04972 } 04973 04974 // Test whether the given value is a vector value which will be legalized 04975 // into a load. 04976 static bool WillBeConstantPoolLoad(SDNode *N) { 04977 if (N->getOpcode() != ISD::BUILD_VECTOR) 04978 return false; 04979 04980 // Check for any non-constant elements. 04981 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) 04982 switch (N->getOperand(i).getNode()->getOpcode()) { 04983 case ISD::UNDEF: 04984 case ISD::ConstantFP: 04985 case ISD::Constant: 04986 break; 04987 default: 04988 return false; 04989 } 04990 04991 // Vectors of all-zeros and all-ones are materialized with special 04992 // instructions rather than being loaded. 04993 return !ISD::isBuildVectorAllZeros(N) && 04994 !ISD::isBuildVectorAllOnes(N); 04995 } 04996 04997 /// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to 04998 /// match movlp{s|d}. The lower half elements should come from lower half of 04999 /// V1 (and in order), and the upper half elements should come from the upper 05000 /// half of V2 (and in order). And since V1 will become the source of the 05001 /// MOVLP, it must be either a vector load or a scalar load to vector. 05002 static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, 05003 ArrayRef<int> Mask, MVT VT) { 05004 if (!VT.is128BitVector()) 05005 return false; 05006 05007 if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1)) 05008 return false; 05009 // Is V2 is a vector load, don't do this transformation. We will try to use 05010 // load folding shufps op. 05011 if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2)) 05012 return false; 05013 05014 unsigned NumElems = VT.getVectorNumElements(); 05015 05016 if (NumElems != 2 && NumElems != 4) 05017 return false; 05018 for (unsigned i = 0, e = NumElems/2; i != e; ++i) 05019 if (!isUndefOrEqual(Mask[i], i)) 05020 return false; 05021 for (unsigned i = NumElems/2, e = NumElems; i != e; ++i) 05022 if (!isUndefOrEqual(Mask[i], i+NumElems)) 05023 return false; 05024 return true; 05025 } 05026 05027 /// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved 05028 /// to an zero vector. 
05029 /// FIXME: move to dag combiner / method on ShuffleVectorSDNode 05030 static bool isZeroShuffle(ShuffleVectorSDNode *N) { 05031 SDValue V1 = N->getOperand(0); 05032 SDValue V2 = N->getOperand(1); 05033 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 05034 for (unsigned i = 0; i != NumElems; ++i) { 05035 int Idx = N->getMaskElt(i); 05036 if (Idx >= (int)NumElems) { 05037 unsigned Opc = V2.getOpcode(); 05038 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode())) 05039 continue; 05040 if (Opc != ISD::BUILD_VECTOR || 05041 !X86::isZeroNode(V2.getOperand(Idx-NumElems))) 05042 return false; 05043 } else if (Idx >= 0) { 05044 unsigned Opc = V1.getOpcode(); 05045 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode())) 05046 continue; 05047 if (Opc != ISD::BUILD_VECTOR || 05048 !X86::isZeroNode(V1.getOperand(Idx))) 05049 return false; 05050 } 05051 } 05052 return true; 05053 } 05054 05055 /// getZeroVector - Returns a vector of specified type with all zero elements. 05056 /// 05057 static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, 05058 SelectionDAG &DAG, SDLoc dl) { 05059 assert(VT.isVector() && "Expected a vector type"); 05060 05061 // Always build SSE zero vectors as <4 x i32> bitcasted 05062 // to their dest type. This ensures they get CSE'd. 05063 SDValue Vec; 05064 if (VT.is128BitVector()) { // SSE 05065 if (Subtarget->hasSSE2()) { // SSE2 05066 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 05067 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 05068 } else { // SSE1 05069 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 05070 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); 05071 } 05072 } else if (VT.is256BitVector()) { // AVX 05073 if (Subtarget->hasInt256()) { // AVX2 05074 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 05075 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; 05076 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops); 05077 } else { 05078 // 256-bit logic and arithmetic instructions in AVX are all 05079 // floating-point, no support for integer ops. Emit fp zeroed vectors. 05080 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 05081 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; 05082 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops); 05083 } 05084 } else if (VT.is512BitVector()) { // AVX-512 05085 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 05086 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, 05087 Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; 05088 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops); 05089 } else if (VT.getScalarType() == MVT::i1) { 05090 assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type"); 05091 SDValue Cst = DAG.getTargetConstant(0, MVT::i1); 05092 SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst); 05093 return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); 05094 } else 05095 llvm_unreachable("Unexpected vector type"); 05096 05097 return DAG.getNode(ISD::BITCAST, dl, VT, Vec); 05098 } 05099 05100 /// getOnesVector - Returns a vector of specified type with all bits set. 05101 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with 05102 /// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately. 05103 /// Then bitcast to their original type, ensuring they get CSE'd. 
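// [Editor's illustrative sketch -- not part of the original source; the demo
//  helper below is hypothetical and assumes a DAG, subtarget and debug
//  location are in scope, as in the lowering routines that follow.]
// getZeroVector above always materializes the zero through a canonical
// integer BUILD_VECTOR, so zeros requested at different types CSE:
static inline SDValue demoCSEdZeroVectors(const X86Subtarget *Subtarget,
                                          SelectionDAG &DAG, SDLoc dl) {
  // On SSE2+ both calls produce the same v4i32 BUILD_VECTOR of zeros; the
  // f32 flavour merely wraps it in an ISD::BITCAST to v4f32.
  SDValue IntZero = getZeroVector(MVT::v4i32, Subtarget, DAG, dl);
  SDValue FpZero  = getZeroVector(MVT::v4f32, Subtarget, DAG, dl);
  (void)IntZero;
  return FpZero;
}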
05104 static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG, 05105 SDLoc dl) { 05106 assert(VT.isVector() && "Expected a vector type"); 05107 05108 SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); 05109 SDValue Vec; 05110 if (VT.is256BitVector()) { 05111 if (HasInt256) { // AVX2 05112 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; 05113 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops); 05114 } else { // AVX 05115 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 05116 Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl); 05117 } 05118 } else if (VT.is128BitVector()) { 05119 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 05120 } else 05121 llvm_unreachable("Unexpected vector type"); 05122 05123 return DAG.getNode(ISD::BITCAST, dl, VT, Vec); 05124 } 05125 05126 /// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements 05127 /// that point to V2 points to its first element. 05128 static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) { 05129 for (unsigned i = 0; i != NumElems; ++i) { 05130 if (Mask[i] > (int)NumElems) { 05131 Mask[i] = NumElems; 05132 } 05133 } 05134 } 05135 05136 /// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd 05137 /// operation of specified width. 05138 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1, 05139 SDValue V2) { 05140 unsigned NumElems = VT.getVectorNumElements(); 05141 SmallVector<int, 8> Mask; 05142 Mask.push_back(NumElems); 05143 for (unsigned i = 1; i != NumElems; ++i) 05144 Mask.push_back(i); 05145 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 05146 } 05147 05148 /// getUnpackl - Returns a vector_shuffle node for an unpackl operation. 05149 static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, 05150 SDValue V2) { 05151 unsigned NumElems = VT.getVectorNumElements(); 05152 SmallVector<int, 8> Mask; 05153 for (unsigned i = 0, e = NumElems/2; i != e; ++i) { 05154 Mask.push_back(i); 05155 Mask.push_back(i + NumElems); 05156 } 05157 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 05158 } 05159 05160 /// getUnpackh - Returns a vector_shuffle node for an unpackh operation. 05161 static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, 05162 SDValue V2) { 05163 unsigned NumElems = VT.getVectorNumElements(); 05164 SmallVector<int, 8> Mask; 05165 for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) { 05166 Mask.push_back(i + Half); 05167 Mask.push_back(i + NumElems + Half); 05168 } 05169 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 05170 } 05171 05172 // PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by 05173 // a generic shuffle instruction because the target has no such instructions. 05174 // Generate shuffles which repeat i16 and i8 several times until they can be 05175 // represented by v4f32 and then be manipulated by target suported shuffles. 
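// [Editor's illustrative sketch -- not part of the original source; the demo
//  helper below is hypothetical.]  The masks getUnpackl/getUnpackh above build
// for v4i32 are <0, 4, 1, 5> and <2, 6, 3, 7>; they round-trip through the
// UNPCKL/UNPCKH predicates defined earlier in this file:
static inline bool demoUnpackMaskRoundTrip() {
  static const int UnpcklMask[] = { 0, 4, 1, 5 };
  static const int UnpckhMask[] = { 2, 6, 3, 7 };
  return isUNPCKLMask(makeArrayRef(UnpcklMask), MVT::v4i32, /*HasInt256=*/false) &&
         isUNPCKHMask(makeArrayRef(UnpckhMask), MVT::v4i32, /*HasInt256=*/false);
}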
05176 static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) { 05177 MVT VT = V.getSimpleValueType(); 05178 int NumElems = VT.getVectorNumElements(); 05179 SDLoc dl(V); 05180 05181 while (NumElems > 4) { 05182 if (EltNo < NumElems/2) { 05183 V = getUnpackl(DAG, dl, VT, V, V); 05184 } else { 05185 V = getUnpackh(DAG, dl, VT, V, V); 05186 EltNo -= NumElems/2; 05187 } 05188 NumElems >>= 1; 05189 } 05190 return V; 05191 } 05192 05193 /// getLegalSplat - Generate a legal splat with supported x86 shuffles 05194 static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) { 05195 MVT VT = V.getSimpleValueType(); 05196 SDLoc dl(V); 05197 05198 if (VT.is128BitVector()) { 05199 V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V); 05200 int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo }; 05201 V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32), 05202 &SplatMask[0]); 05203 } else if (VT.is256BitVector()) { 05204 // To use VPERMILPS to splat scalars, the second half of indicies must 05205 // refer to the higher part, which is a duplication of the lower one, 05206 // because VPERMILPS can only handle in-lane permutations. 05207 int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo, 05208 EltNo+4, EltNo+4, EltNo+4, EltNo+4 }; 05209 05210 V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V); 05211 V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32), 05212 &SplatMask[0]); 05213 } else 05214 llvm_unreachable("Vector size not supported"); 05215 05216 return DAG.getNode(ISD::BITCAST, dl, VT, V); 05217 } 05218 05219 /// PromoteSplat - Splat is promoted to target supported vector shuffles. 05220 static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { 05221 MVT SrcVT = SV->getSimpleValueType(0); 05222 SDValue V1 = SV->getOperand(0); 05223 SDLoc dl(SV); 05224 05225 int EltNo = SV->getSplatIndex(); 05226 int NumElems = SrcVT.getVectorNumElements(); 05227 bool Is256BitVec = SrcVT.is256BitVector(); 05228 05229 assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) && 05230 "Unknown how to promote splat for type"); 05231 05232 // Extract the 128-bit part containing the splat element and update 05233 // the splat element index when it refers to the higher register. 05234 if (Is256BitVec) { 05235 V1 = Extract128BitVector(V1, EltNo, DAG, dl); 05236 if (EltNo >= NumElems/2) 05237 EltNo -= NumElems/2; 05238 } 05239 05240 // All i16 and i8 vector types can't be used directly by a generic shuffle 05241 // instruction because the target has no such instruction. Generate shuffles 05242 // which repeat i16 and i8 several times until they fit in i32, and then can 05243 // be manipulated by target suported shuffles. 05244 MVT EltVT = SrcVT.getVectorElementType(); 05245 if (EltVT == MVT::i8 || EltVT == MVT::i16) 05246 V1 = PromoteSplati8i16(V1, DAG, EltNo); 05247 05248 // Recreate the 256-bit vector and place the same 128-bit vector 05249 // into the low and high part. This is necessary because we want 05250 // to use VPERM* to shuffle the vectors 05251 if (Is256BitVec) { 05252 V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1); 05253 } 05254 05255 return getLegalSplat(DAG, V1, EltNo); 05256 } 05257 05258 /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified 05259 /// vector of zero or undef vector. This produces a shuffle where the low 05260 /// element of V2 is swizzled into the zero/undef vector, landing at element 05261 /// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). 
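// [Editor's illustrative sketch -- not part of the original source; the demo
//  helper below is hypothetical.]  For the 256-bit case getLegalSplat above
// must keep VPERMILPS in-lane, so splatting element EltNo uses a mask whose
// high lane repeats the low lane's pattern against the duplicated upper half:
static inline void demoBuild256BitSplatMask(SmallVectorImpl<int> &Mask,
                                            int EltNo) {
  Mask.clear();
  for (int i = 0; i != 4; ++i)
    Mask.push_back(EltNo);       // low 128-bit lane: EltNo x4
  for (int i = 0; i != 4; ++i)
    Mask.push_back(EltNo + 4);   // high lane: same element of the high half
}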
05262 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, 05263 bool IsZero, 05264 const X86Subtarget *Subtarget, 05265 SelectionDAG &DAG) { 05266 MVT VT = V2.getSimpleValueType(); 05267 SDValue V1 = IsZero 05268 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT); 05269 unsigned NumElems = VT.getVectorNumElements(); 05270 SmallVector<int, 16> MaskVec; 05271 for (unsigned i = 0; i != NumElems; ++i) 05272 // If this is the insertion idx, put the low elt of V2 here. 05273 MaskVec.push_back(i == Idx ? NumElems : i); 05274 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]); 05275 } 05276 05277 /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the 05278 /// target specific opcode. Returns true if the Mask could be calculated. Sets 05279 /// IsUnary to true if only uses one source. Note that this will set IsUnary for 05280 /// shuffles which use a single input multiple times, and in those cases it will 05281 /// adjust the mask to only have indices within that single input. 05282 static bool getTargetShuffleMask(SDNode *N, MVT VT, 05283 SmallVectorImpl<int> &Mask, bool &IsUnary) { 05284 unsigned NumElems = VT.getVectorNumElements(); 05285 SDValue ImmN; 05286 05287 IsUnary = false; 05288 bool IsFakeUnary = false; 05289 switch(N->getOpcode()) { 05290 case X86ISD::SHUFP: 05291 ImmN = N->getOperand(N->getNumOperands()-1); 05292 DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 05293 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); 05294 break; 05295 case X86ISD::UNPCKH: 05296 DecodeUNPCKHMask(VT, Mask); 05297 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); 05298 break; 05299 case X86ISD::UNPCKL: 05300 DecodeUNPCKLMask(VT, Mask); 05301 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); 05302 break; 05303 case X86ISD::MOVHLPS: 05304 DecodeMOVHLPSMask(NumElems, Mask); 05305 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); 05306 break; 05307 case X86ISD::MOVLHPS: 05308 DecodeMOVLHPSMask(NumElems, Mask); 05309 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); 05310 break; 05311 case X86ISD::PALIGNR: 05312 ImmN = N->getOperand(N->getNumOperands()-1); 05313 DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 05314 break; 05315 case X86ISD::PSHUFD: 05316 case X86ISD::VPERMILP: 05317 ImmN = N->getOperand(N->getNumOperands()-1); 05318 DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 05319 IsUnary = true; 05320 break; 05321 case X86ISD::PSHUFHW: 05322 ImmN = N->getOperand(N->getNumOperands()-1); 05323 DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 05324 IsUnary = true; 05325 break; 05326 case X86ISD::PSHUFLW: 05327 ImmN = N->getOperand(N->getNumOperands()-1); 05328 DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 05329 IsUnary = true; 05330 break; 05331 case X86ISD::PSHUFB: { 05332 IsUnary = true; 05333 SDValue MaskNode = N->getOperand(1); 05334 while (MaskNode->getOpcode() == ISD::BITCAST) 05335 MaskNode = MaskNode->getOperand(0); 05336 05337 if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) { 05338 // If we have a build-vector, then things are easy. 
05339 EVT VT = MaskNode.getValueType(); 05340 assert(VT.isVector() && 05341 "Can't produce a non-vector with a build_vector!"); 05342 if (!VT.isInteger()) 05343 return false; 05344 05345 int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8; 05346 05347 SmallVector<uint64_t, 32> RawMask; 05348 for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) { 05349 auto *CN = dyn_cast<ConstantSDNode>(MaskNode->getOperand(i)); 05350 if (!CN) 05351 return false; 05352 APInt MaskElement = CN->getAPIntValue(); 05353 05354 // We now have to decode the element which could be any integer size and 05355 // extract each byte of it. 05356 for (int j = 0; j < NumBytesPerElement; ++j) { 05357 // Note that this is x86 and so always little endian: the low byte is 05358 // the first byte of the mask. 05359 RawMask.push_back(MaskElement.getLoBits(8).getZExtValue()); 05360 MaskElement = MaskElement.lshr(8); 05361 } 05362 } 05363 DecodePSHUFBMask(RawMask, Mask); 05364 break; 05365 } 05366 05367 auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode); 05368 if (!MaskLoad) 05369 return false; 05370 05371 SDValue Ptr = MaskLoad->getBasePtr(); 05372 if (Ptr->getOpcode() == X86ISD::Wrapper) 05373 Ptr = Ptr->getOperand(0); 05374 05375 auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr); 05376 if (!MaskCP || MaskCP->isMachineConstantPoolEntry()) 05377 return false; 05378 05379 if (auto *C = dyn_cast<ConstantDataSequential>(MaskCP->getConstVal())) { 05380 // FIXME: Support AVX-512 here. 05381 if (!C->getType()->isVectorTy() || 05382 (C->getNumElements() != 16 && C->getNumElements() != 32)) 05383 return false; 05384 05385 assert(C->getType()->isVectorTy() && "Expected a vector constant."); 05386 DecodePSHUFBMask(C, Mask); 05387 break; 05388 } 05389 05390 return false; 05391 } 05392 case X86ISD::VPERMI: 05393 ImmN = N->getOperand(N->getNumOperands()-1); 05394 DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 05395 IsUnary = true; 05396 break; 05397 case X86ISD::MOVSS: 05398 case X86ISD::MOVSD: { 05399 // The index 0 always comes from the first element of the second source, 05400 // this is why MOVSS and MOVSD are used in the first place. The other 05401 // elements come from the other positions of the first source vector 05402 Mask.push_back(NumElems); 05403 for (unsigned i = 1; i != NumElems; ++i) { 05404 Mask.push_back(i); 05405 } 05406 break; 05407 } 05408 case X86ISD::VPERM2X128: 05409 ImmN = N->getOperand(N->getNumOperands()-1); 05410 DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); 05411 if (Mask.empty()) return false; 05412 break; 05413 case X86ISD::MOVSLDUP: 05414 DecodeMOVSLDUPMask(VT, Mask); 05415 break; 05416 case X86ISD::MOVSHDUP: 05417 DecodeMOVSHDUPMask(VT, Mask); 05418 break; 05419 case X86ISD::MOVDDUP: 05420 case X86ISD::MOVLHPD: 05421 case X86ISD::MOVLPD: 05422 case X86ISD::MOVLPS: 05423 // Not yet implemented 05424 return false; 05425 default: llvm_unreachable("unknown target shuffle node"); 05426 } 05427 05428 // If we have a fake unary shuffle, the shuffle mask is spread across two 05429 // inputs that are actually the same node. Re-map the mask to always point 05430 // into the first input. 05431 if (IsFakeUnary) 05432 for (int &M : Mask) 05433 if (M >= (int)Mask.size()) 05434 M -= Mask.size(); 05435 05436 return true; 05437 } 05438 05439 /// getShuffleScalarElt - Returns the scalar element that will make up the ith 05440 /// element of the result of the vector shuffle. 
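// [Editor's illustrative sketch -- not part of the original source; the demo
//  helper below is hypothetical and assumes N is one of the target shuffle
//  opcodes handled by getTargetShuffleMask above (anything else hits its
//  llvm_unreachable).]  Decoding an X86ISD::UNPCKL v4i32 node, for example,
// yields the mask <0, 4, 1, 5>; if both operands are the same node the
// fake-unary remapping folds it to <0, 0, 1, 1> and sets IsUnary:
static inline bool demoDecodeTargetShuffle(SDNode *N,
                                           SmallVectorImpl<int> &Mask,
                                           bool &IsUnary) {
  MVT VT = N->getSimpleValueType(0);
  IsUnary = false;
  return getTargetShuffleMask(N, VT, Mask, IsUnary);
}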
05441 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, 05442 unsigned Depth) { 05443 if (Depth == 6) 05444 return SDValue(); // Limit search depth. 05445 05446 SDValue V = SDValue(N, 0); 05447 EVT VT = V.getValueType(); 05448 unsigned Opcode = V.getOpcode(); 05449 05450 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars. 05451 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) { 05452 int Elt = SV->getMaskElt(Index); 05453 05454 if (Elt < 0) 05455 return DAG.getUNDEF(VT.getVectorElementType()); 05456 05457 unsigned NumElems = VT.getVectorNumElements(); 05458 SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0) 05459 : SV->getOperand(1); 05460 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1); 05461 } 05462 05463 // Recurse into target specific vector shuffles to find scalars. 05464 if (isTargetShuffle(Opcode)) { 05465 MVT ShufVT = V.getSimpleValueType(); 05466 unsigned NumElems = ShufVT.getVectorNumElements(); 05467 SmallVector<int, 16> ShuffleMask; 05468 bool IsUnary; 05469 05470 if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary)) 05471 return SDValue(); 05472 05473 int Elt = ShuffleMask[Index]; 05474 if (Elt < 0) 05475 return DAG.getUNDEF(ShufVT.getVectorElementType()); 05476 05477 SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0) 05478 : N->getOperand(1); 05479 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, 05480 Depth+1); 05481 } 05482 05483 // Actual nodes that may contain scalar elements 05484 if (Opcode == ISD::BITCAST) { 05485 V = V.getOperand(0); 05486 EVT SrcVT = V.getValueType(); 05487 unsigned NumElems = VT.getVectorNumElements(); 05488 05489 if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems) 05490 return SDValue(); 05491 } 05492 05493 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR) 05494 return (Index == 0) ? V.getOperand(0) 05495 : DAG.getUNDEF(VT.getVectorElementType()); 05496 05497 if (V.getOpcode() == ISD::BUILD_VECTOR) 05498 return V.getOperand(Index); 05499 05500 return SDValue(); 05501 } 05502 05503 /// getNumOfConsecutiveZeros - Return the number of elements of a vector 05504 /// shuffle operation which come from a consecutively from a zero. The 05505 /// search can start in two different directions, from left or right. 05506 /// We count undefs as zeros until PreferredNum is reached. 05507 static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, 05508 unsigned NumElems, bool ZerosFromLeft, 05509 SelectionDAG &DAG, 05510 unsigned PreferredNum = -1U) { 05511 unsigned NumZeros = 0; 05512 for (unsigned i = 0; i != NumElems; ++i) { 05513 unsigned Index = ZerosFromLeft ? i : NumElems - i - 1; 05514 SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0); 05515 if (!Elt.getNode()) 05516 break; 05517 05518 if (X86::isZeroNode(Elt)) 05519 ++NumZeros; 05520 else if (Elt.getOpcode() == ISD::UNDEF) // Undef as zero up to PreferredNum. 05521 NumZeros = std::min(NumZeros + 1, PreferredNum); 05522 else 05523 break; 05524 } 05525 05526 return NumZeros; 05527 } 05528 05529 /// isShuffleMaskConsecutive - Check if the shuffle mask indicies [MaskI, MaskE) 05530 /// correspond consecutively to elements from one of the vector operands, 05531 /// starting from its index OpIdx. Also tell OpNum which source vector operand. 
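// [Editor's illustrative sketch -- not part of the original source; the demo
//  helper below is hypothetical.]  getShuffleScalarElt above is how a result
// element is traced back to a concrete scalar; paired with X86::isZeroNode it
// is the per-element zero test that getNumOfConsecutiveZeros performs:
static inline bool demoResultEltIsZero(SDNode *Shuffle, unsigned Index,
                                       SelectionDAG &DAG) {
  SDValue Elt = getShuffleScalarElt(Shuffle, Index, DAG, /*Depth=*/0);
  return Elt.getNode() && X86::isZeroNode(Elt);
}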
05532 static 05533 bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp, 05534 unsigned MaskI, unsigned MaskE, unsigned OpIdx, 05535 unsigned NumElems, unsigned &OpNum) { 05536 bool SeenV1 = false; 05537 bool SeenV2 = false; 05538 05539 for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) { 05540 int Idx = SVOp->getMaskElt(i); 05541 // Ignore undef indicies 05542 if (Idx < 0) 05543 continue; 05544 05545 if (Idx < (int)NumElems) 05546 SeenV1 = true; 05547 else 05548 SeenV2 = true; 05549 05550 // Only accept consecutive elements from the same vector 05551 if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2)) 05552 return false; 05553 } 05554 05555 OpNum = SeenV1 ? 0 : 1; 05556 return true; 05557 } 05558 05559 /// isVectorShiftRight - Returns true if the shuffle can be implemented as a 05560 /// logical left shift of a vector. 05561 static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 05562 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 05563 unsigned NumElems = 05564 SVOp->getSimpleValueType(0).getVectorNumElements(); 05565 unsigned NumZeros = getNumOfConsecutiveZeros( 05566 SVOp, NumElems, false /* check zeros from right */, DAG, 05567 SVOp->getMaskElt(0)); 05568 unsigned OpSrc; 05569 05570 if (!NumZeros) 05571 return false; 05572 05573 // Considering the elements in the mask that are not consecutive zeros, 05574 // check if they consecutively come from only one of the source vectors. 05575 // 05576 // V1 = {X, A, B, C} 0 05577 // \ \ \ / 05578 // vector_shuffle V1, V2 <1, 2, 3, X> 05579 // 05580 if (!isShuffleMaskConsecutive(SVOp, 05581 0, // Mask Start Index 05582 NumElems-NumZeros, // Mask End Index(exclusive) 05583 NumZeros, // Where to start looking in the src vector 05584 NumElems, // Number of elements in vector 05585 OpSrc)) // Which source operand ? 05586 return false; 05587 05588 isLeft = false; 05589 ShAmt = NumZeros; 05590 ShVal = SVOp->getOperand(OpSrc); 05591 return true; 05592 } 05593 05594 /// isVectorShiftLeft - Returns true if the shuffle can be implemented as a 05595 /// logical left shift of a vector. 05596 static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 05597 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 05598 unsigned NumElems = 05599 SVOp->getSimpleValueType(0).getVectorNumElements(); 05600 unsigned NumZeros = getNumOfConsecutiveZeros( 05601 SVOp, NumElems, true /* check zeros from left */, DAG, 05602 NumElems - SVOp->getMaskElt(NumElems - 1) - 1); 05603 unsigned OpSrc; 05604 05605 if (!NumZeros) 05606 return false; 05607 05608 // Considering the elements in the mask that are not consecutive zeros, 05609 // check if they consecutively come from only one of the source vectors. 05610 // 05611 // 0 { A, B, X, X } = V2 05612 // / \ / / 05613 // vector_shuffle V1, V2 <X, X, 4, 5> 05614 // 05615 if (!isShuffleMaskConsecutive(SVOp, 05616 NumZeros, // Mask Start Index 05617 NumElems, // Mask End Index(exclusive) 05618 0, // Where to start looking in the src vector 05619 NumElems, // Number of elements in vector 05620 OpSrc)) // Which source operand ? 05621 return false; 05622 05623 isLeft = true; 05624 ShAmt = NumZeros; 05625 ShVal = SVOp->getOperand(OpSrc); 05626 return true; 05627 } 05628 05629 /// isVectorShift - Returns true if the shuffle can be implemented as a 05630 /// logical left or right shift of a vector. 
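// [Editor's illustrative sketch -- not part of the original source; the demo
//  helper below is hypothetical and mirrors isVectorShift minus its 128-bit
//  check.]  Worked example: a v4i32 shuffle of (V1, all-zeros) with mask
// <1, 2, 3, 4> is recognized by isVectorShiftRight above as isLeft == false,
// ShAmt == 1, i.e. a whole-register logical right shift by one 32-bit element
// (PSRLDQ $4 at the byte level).
static inline bool demoRecognizeVectorShift(ShuffleVectorSDNode *SVOp,
                                            SelectionDAG &DAG, bool &isLeft,
                                            SDValue &ShVal, unsigned &ShAmt) {
  return isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
         isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt);
}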
05631 static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 05632 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 05633 // Although the logic below support any bitwidth size, there are no 05634 // shift instructions which handle more than 128-bit vectors. 05635 if (!SVOp->getSimpleValueType(0).is128BitVector()) 05636 return false; 05637 05638 if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) || 05639 isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt)) 05640 return true; 05641 05642 return false; 05643 } 05644 05645 /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. 05646 /// 05647 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, 05648 unsigned NumNonZero, unsigned NumZero, 05649 SelectionDAG &DAG, 05650 const X86Subtarget* Subtarget, 05651 const TargetLowering &TLI) { 05652 if (NumNonZero > 8) 05653 return SDValue(); 05654 05655 SDLoc dl(Op); 05656 SDValue V; 05657 bool First = true; 05658 for (unsigned i = 0; i < 16; ++i) { 05659 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; 05660 if (ThisIsNonZero && First) { 05661 if (NumZero) 05662 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); 05663 else 05664 V = DAG.getUNDEF(MVT::v8i16); 05665 First = false; 05666 } 05667 05668 if ((i & 1) != 0) { 05669 SDValue ThisElt, LastElt; 05670 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; 05671 if (LastIsNonZero) { 05672 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, 05673 MVT::i16, Op.getOperand(i-1)); 05674 } 05675 if (ThisIsNonZero) { 05676 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); 05677 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, 05678 ThisElt, DAG.getConstant(8, MVT::i8)); 05679 if (LastIsNonZero) 05680 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); 05681 } else 05682 ThisElt = LastElt; 05683 05684 if (ThisElt.getNode()) 05685 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, 05686 DAG.getIntPtrConstant(i/2)); 05687 } 05688 } 05689 05690 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V); 05691 } 05692 05693 /// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. 05694 /// 05695 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, 05696 unsigned NumNonZero, unsigned NumZero, 05697 SelectionDAG &DAG, 05698 const X86Subtarget* Subtarget, 05699 const TargetLowering &TLI) { 05700 if (NumNonZero > 4) 05701 return SDValue(); 05702 05703 SDLoc dl(Op); 05704 SDValue V; 05705 bool First = true; 05706 for (unsigned i = 0; i < 8; ++i) { 05707 bool isNonZero = (NonZeros & (1 << i)) != 0; 05708 if (isNonZero) { 05709 if (First) { 05710 if (NumZero) 05711 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); 05712 else 05713 V = DAG.getUNDEF(MVT::v8i16); 05714 First = false; 05715 } 05716 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, 05717 MVT::v8i16, V, Op.getOperand(i), 05718 DAG.getIntPtrConstant(i)); 05719 } 05720 } 05721 05722 return V; 05723 } 05724 05725 /// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32. 
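// [Editor's illustrative sketch -- not part of the original source; the demo
//  helper below is hypothetical.]  LowerBuildVectorv16i8 above combines two
// adjacent non-zero bytes into one 16-bit lane exactly like this (the odd
// byte is zero-extended, shifted left by 8 and OR'd with its even neighbour,
// which stays in the low half because x86 is little-endian), and then inserts
// the pair at v8i16 position i/2:
static inline unsigned demoPairBytesIntoWord(unsigned char EvenByte,
                                             unsigned char OddByte) {
  return (unsigned(OddByte) << 8) | unsigned(EvenByte);
}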
05726 static SDValue LowerBuildVectorv4x32(SDValue Op, unsigned NumElems, 05727 unsigned NonZeros, unsigned NumNonZero, 05728 unsigned NumZero, SelectionDAG &DAG, 05729 const X86Subtarget *Subtarget, 05730 const TargetLowering &TLI) { 05731 // We know there's at least one non-zero element 05732 unsigned FirstNonZeroIdx = 0; 05733 SDValue FirstNonZero = Op->getOperand(FirstNonZeroIdx); 05734 while (FirstNonZero.getOpcode() == ISD::UNDEF || 05735 X86::isZeroNode(FirstNonZero)) { 05736 ++FirstNonZeroIdx; 05737 FirstNonZero = Op->getOperand(FirstNonZeroIdx); 05738 } 05739 05740 if (FirstNonZero.getOpcode() != ISD::EXTRACT_VECTOR_ELT || 05741 !isa<ConstantSDNode>(FirstNonZero.getOperand(1))) 05742 return SDValue(); 05743 05744 SDValue V = FirstNonZero.getOperand(0); 05745 MVT VVT = V.getSimpleValueType(); 05746 if (!Subtarget->hasSSE41() || (VVT != MVT::v4f32 && VVT != MVT::v4i32)) 05747 return SDValue(); 05748 05749 unsigned FirstNonZeroDst = 05750 cast<ConstantSDNode>(FirstNonZero.getOperand(1))->getZExtValue(); 05751 unsigned CorrectIdx = FirstNonZeroDst == FirstNonZeroIdx; 05752 unsigned IncorrectIdx = CorrectIdx ? -1U : FirstNonZeroIdx; 05753 unsigned IncorrectDst = CorrectIdx ? -1U : FirstNonZeroDst; 05754 05755 for (unsigned Idx = FirstNonZeroIdx + 1; Idx < NumElems; ++Idx) { 05756 SDValue Elem = Op.getOperand(Idx); 05757 if (Elem.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elem)) 05758 continue; 05759 05760 // TODO: What else can be here? Deal with it. 05761 if (Elem.getOpcode() != ISD::EXTRACT_VECTOR_ELT) 05762 return SDValue(); 05763 05764 // TODO: Some optimizations are still possible here 05765 // ex: Getting one element from a vector, and the rest from another. 05766 if (Elem.getOperand(0) != V) 05767 return SDValue(); 05768 05769 unsigned Dst = cast<ConstantSDNode>(Elem.getOperand(1))->getZExtValue(); 05770 if (Dst == Idx) 05771 ++CorrectIdx; 05772 else if (IncorrectIdx == -1U) { 05773 IncorrectIdx = Idx; 05774 IncorrectDst = Dst; 05775 } else 05776 // There was already one element with an incorrect index. 05777 // We can't optimize this case to an insertps. 05778 return SDValue(); 05779 } 05780 05781 if (NumNonZero == CorrectIdx || NumNonZero == CorrectIdx + 1) { 05782 SDLoc dl(Op); 05783 EVT VT = Op.getSimpleValueType(); 05784 unsigned ElementMoveMask = 0; 05785 if (IncorrectIdx == -1U) 05786 ElementMoveMask = FirstNonZeroIdx << 6 | FirstNonZeroIdx << 4; 05787 else 05788 ElementMoveMask = IncorrectDst << 6 | IncorrectIdx << 4; 05789 05790 SDValue InsertpsMask = 05791 DAG.getIntPtrConstant(ElementMoveMask | (~NonZeros & 0xf)); 05792 return DAG.getNode(X86ISD::INSERTPS, dl, VT, V, V, InsertpsMask); 05793 } 05794 05795 return SDValue(); 05796 } 05797 05798 /// getVShift - Return a vector logical shift node. 05799 /// 05800 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, 05801 unsigned NumBits, SelectionDAG &DAG, 05802 const TargetLowering &TLI, SDLoc dl) { 05803 assert(VT.is128BitVector() && "Unknown type for VShift"); 05804 EVT ShVT = MVT::v2i64; 05805 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ; 05806 SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp); 05807 return DAG.getNode(ISD::BITCAST, dl, VT, 05808 DAG.getNode(Opc, dl, ShVT, SrcOp, 05809 DAG.getConstant(NumBits, 05810 TLI.getScalarShiftAmountTy(SrcOp.getValueType())))); 05811 } 05812 05813 static SDValue 05814 LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) { 05815 05816 // Check if the scalar load can be widened into a vector load. 
And if 05817 // the address is "base + cst" see if the cst can be "absorbed" into 05818 // the shuffle mask. 05819 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) { 05820 SDValue Ptr = LD->getBasePtr(); 05821 if (!ISD::isNormalLoad(LD) || LD->isVolatile()) 05822 return SDValue(); 05823 EVT PVT = LD->getValueType(0); 05824 if (PVT != MVT::i32 && PVT != MVT::f32) 05825 return SDValue(); 05826 05827 int FI = -1; 05828 int64_t Offset = 0; 05829 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) { 05830 FI = FINode->getIndex(); 05831 Offset = 0; 05832 } else if (DAG.isBaseWithConstantOffset(Ptr) && 05833 isa<FrameIndexSDNode>(Ptr.getOperand(0))) { 05834 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex(); 05835 Offset = Ptr.getConstantOperandVal(1); 05836 Ptr = Ptr.getOperand(0); 05837 } else { 05838 return SDValue(); 05839 } 05840 05841 // FIXME: 256-bit vector instructions don't require a strict alignment, 05842 // improve this code to support it better. 05843 unsigned RequiredAlign = VT.getSizeInBits()/8; 05844 SDValue Chain = LD->getChain(); 05845 // Make sure the stack object alignment is at least 16 or 32. 05846 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 05847 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) { 05848 if (MFI->isFixedObjectIndex(FI)) { 05849 // Can't change the alignment. FIXME: It's possible to compute 05850 // the exact stack offset and reference FI + adjust offset instead. 05851 // If someone *really* cares about this. That's the way to implement it. 05852 return SDValue(); 05853 } else { 05854 MFI->setObjectAlignment(FI, RequiredAlign); 05855 } 05856 } 05857 05858 // (Offset % 16 or 32) must be multiple of 4. Then address is then 05859 // Ptr + (Offset & ~15). 05860 if (Offset < 0) 05861 return SDValue(); 05862 if ((Offset % RequiredAlign) & 3) 05863 return SDValue(); 05864 int64_t StartOffset = Offset & ~(RequiredAlign-1); 05865 if (StartOffset) 05866 Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(), 05867 Ptr,DAG.getConstant(StartOffset, Ptr.getValueType())); 05868 05869 int EltNo = (Offset - StartOffset) >> 2; 05870 unsigned NumElems = VT.getVectorNumElements(); 05871 05872 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems); 05873 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr, 05874 LD->getPointerInfo().getWithOffset(StartOffset), 05875 false, false, false, 0); 05876 05877 SmallVector<int, 8> Mask; 05878 for (unsigned i = 0; i != NumElems; ++i) 05879 Mask.push_back(EltNo); 05880 05881 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]); 05882 } 05883 05884 return SDValue(); 05885 } 05886 05887 /// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a 05888 /// vector of type 'VT', see if the elements can be replaced by a single large 05889 /// load which has the same value as a build_vector whose operands are 'elts'. 05890 /// 05891 /// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a 05892 /// 05893 /// FIXME: we'd also like to handle the case where the last elements are zero 05894 /// rather than undef via VZEXT_LOAD, but we do not detect that case today. 05895 /// There's even a handy isZeroNode for that purpose. 
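// [Editor's illustrative sketch -- not part of the original source; the demo
//  helper below is hypothetical.]  The "base + cst" absorption performed by
// LowerAsSplatVectorLoad above, in plain arithmetic: a 128-bit splat load at
// frame offset 20 becomes a full 16-byte load at offset 16 plus a splat of
// element (20 - 16) / 4 == 1.
static inline int demoSplatLoadEltIndex(int64_t Offset, unsigned VecBits) {
  unsigned RequiredAlign = VecBits / 8;           // 16 or 32 bytes
  if (Offset < 0 || ((Offset % RequiredAlign) & 3))
    return -1;                                    // offset cannot be absorbed
  int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
  return int((Offset - StartOffset) >> 2);        // index of the 32-bit element
}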
05896 static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, 05897 SDLoc &DL, SelectionDAG &DAG, 05898 bool isAfterLegalize) { 05899 EVT EltVT = VT.getVectorElementType(); 05900 unsigned NumElems = Elts.size(); 05901 05902 LoadSDNode *LDBase = nullptr; 05903 unsigned LastLoadedElt = -1U; 05904 05905 // For each element in the initializer, see if we've found a load or an undef. 05906 // If we don't find an initial load element, or later load elements are 05907 // non-consecutive, bail out. 05908 for (unsigned i = 0; i < NumElems; ++i) { 05909 SDValue Elt = Elts[i]; 05910 05911 if (!Elt.getNode() || 05912 (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) 05913 return SDValue(); 05914 if (!LDBase) { 05915 if (Elt.getNode()->getOpcode() == ISD::UNDEF) 05916 return SDValue(); 05917 LDBase = cast<LoadSDNode>(Elt.getNode()); 05918 LastLoadedElt = i; 05919 continue; 05920 } 05921 if (Elt.getOpcode() == ISD::UNDEF) 05922 continue; 05923 05924 LoadSDNode *LD = cast<LoadSDNode>(Elt); 05925 if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i)) 05926 return SDValue(); 05927 LastLoadedElt = i; 05928 } 05929 05930 // If we have found an entire vector of loads and undefs, then return a large 05931 // load of the entire vector width starting at the base pointer. If we found 05932 // consecutive loads for the low half, generate a vzext_load node. 05933 if (LastLoadedElt == NumElems - 1) { 05934 05935 if (isAfterLegalize && 05936 !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT)) 05937 return SDValue(); 05938 05939 SDValue NewLd = SDValue(); 05940 05941 if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16) 05942 NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), 05943 LDBase->getPointerInfo(), 05944 LDBase->isVolatile(), LDBase->isNonTemporal(), 05945 LDBase->isInvariant(), 0); 05946 NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), 05947 LDBase->getPointerInfo(), 05948 LDBase->isVolatile(), LDBase->isNonTemporal(), 05949 LDBase->isInvariant(), LDBase->getAlignment()); 05950 05951 if (LDBase->hasAnyUseOfValue(1)) { 05952 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, 05953 SDValue(LDBase, 1), 05954 SDValue(NewLd.getNode(), 1)); 05955 DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain); 05956 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1), 05957 SDValue(NewLd.getNode(), 1)); 05958 } 05959 05960 return NewLd; 05961 } 05962 if (NumElems == 4 && LastLoadedElt == 1 && 05963 DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) { 05964 SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); 05965 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; 05966 SDValue ResNode = 05967 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64, 05968 LDBase->getPointerInfo(), 05969 LDBase->getAlignment(), 05970 false/*isVolatile*/, true/*ReadMem*/, 05971 false/*WriteMem*/); 05972 05973 // Make sure the newly-created LOAD is in the same position as LDBase in 05974 // terms of dependency. We create a TokenFactor for LDBase and ResNode, and 05975 // update uses of LDBase's output chain to use the TokenFactor. 
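// --- Editorial sketch (not part of the LLVM source; names are illustrative).
// The chain fix-up below is shared by both fold shapes this routine produces.
// A compact model of how the shape was chosen from the element scan above,
// with the legality and alignment checks of the real code omitted:
enum ModelConsecutiveLoadKind { MCLK_None, MCLK_WideLoad, MCLK_VZextLoad };
static ModelConsecutiveLoadKind modelPickLoadKind(unsigned NumElems,
                                                  unsigned LastLoadedElt) {
  if (LastLoadedElt == NumElems - 1)
    return MCLK_WideLoad;        // every element covered: one full-width load
  if (NumElems == 4 && LastLoadedElt == 1)
    return MCLK_VZextLoad;       // only the low half: 64-bit load, upper half zero
  return MCLK_None;              // no profitable fold
}
// --- End of editorial sketch; the original listing continues below. --------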
05976 if (LDBase->hasAnyUseOfValue(1)) { 05977 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, 05978 SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1)); 05979 DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain); 05980 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1), 05981 SDValue(ResNode.getNode(), 1)); 05982 } 05983 05984 return DAG.getNode(ISD::BITCAST, DL, VT, ResNode); 05985 } 05986 return SDValue(); 05987 } 05988 05989 /// LowerVectorBroadcast - Attempt to use the vbroadcast instruction 05990 /// to generate a splat value for the following cases: 05991 /// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant. 05992 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from 05993 /// a scalar load, or a constant. 05994 /// The VBROADCAST node is returned when a pattern is found, 05995 /// or SDValue() otherwise. 05996 static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, 05997 SelectionDAG &DAG) { 05998 if (!Subtarget->hasFp256()) 05999 return SDValue(); 06000 06001 MVT VT = Op.getSimpleValueType(); 06002 SDLoc dl(Op); 06003 06004 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && 06005 "Unsupported vector type for broadcast."); 06006 06007 SDValue Ld; 06008 bool ConstSplatVal; 06009 06010 switch (Op.getOpcode()) { 06011 default: 06012 // Unknown pattern found. 06013 return SDValue(); 06014 06015 case ISD::BUILD_VECTOR: { 06016 auto *BVOp = cast<BuildVectorSDNode>(Op.getNode()); 06017 BitVector UndefElements; 06018 SDValue Splat = BVOp->getSplatValue(&UndefElements); 06019 06020 // We need a splat of a single value to use broadcast, and it doesn't 06021 // make any sense if the value is only in one element of the vector. 06022 if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1) 06023 return SDValue(); 06024 06025 Ld = Splat; 06026 ConstSplatVal = (Ld.getOpcode() == ISD::Constant || 06027 Ld.getOpcode() == ISD::ConstantFP); 06028 06029 // Make sure that all of the users of a non-constant load are from the 06030 // BUILD_VECTOR node. 06031 if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode())) 06032 return SDValue(); 06033 break; 06034 } 06035 06036 case ISD::VECTOR_SHUFFLE: { 06037 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 06038 06039 // Shuffles must have a splat mask where the first element is 06040 // broadcasted. 06041 if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0) 06042 return SDValue(); 06043 06044 SDValue Sc = Op.getOperand(0); 06045 if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR && 06046 Sc.getOpcode() != ISD::BUILD_VECTOR) { 06047 06048 if (!Subtarget->hasInt256()) 06049 return SDValue(); 06050 06051 // Use the register form of the broadcast instruction available on AVX2. 06052 if (VT.getSizeInBits() >= 256) 06053 Sc = Extract128BitVector(Sc, 0, DAG, dl); 06054 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc); 06055 } 06056 06057 Ld = Sc.getOperand(0); 06058 ConstSplatVal = (Ld.getOpcode() == ISD::Constant || 06059 Ld.getOpcode() == ISD::ConstantFP); 06060 06061 // The scalar_to_vector node and the suspected 06062 // load node must have exactly one user. 06063 // Constants may have multiple users. 
06064 06065 // AVX-512 has register version of the broadcast 06066 bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() && 06067 Ld.getValueType().getSizeInBits() >= 32; 06068 if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) && 06069 !hasRegVer)) 06070 return SDValue(); 06071 break; 06072 } 06073 } 06074 06075 bool IsGE256 = (VT.getSizeInBits() >= 256); 06076 06077 // Handle the broadcasting a single constant scalar from the constant pool 06078 // into a vector. On Sandybridge it is still better to load a constant vector 06079 // from the constant pool and not to broadcast it from a scalar. 06080 if (ConstSplatVal && Subtarget->hasInt256()) { 06081 EVT CVT = Ld.getValueType(); 06082 assert(!CVT.isVector() && "Must not broadcast a vector type"); 06083 unsigned ScalarSize = CVT.getSizeInBits(); 06084 06085 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) { 06086 const Constant *C = nullptr; 06087 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld)) 06088 C = CI->getConstantIntValue(); 06089 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld)) 06090 C = CF->getConstantFPValue(); 06091 06092 assert(C && "Invalid constant type"); 06093 06094 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 06095 SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy()); 06096 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); 06097 Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP, 06098 MachinePointerInfo::getConstantPool(), 06099 false, false, false, Alignment); 06100 06101 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); 06102 } 06103 } 06104 06105 bool IsLoad = ISD::isNormalLoad(Ld.getNode()); 06106 unsigned ScalarSize = Ld.getValueType().getSizeInBits(); 06107 06108 // Handle AVX2 in-register broadcasts. 06109 if (!IsLoad && Subtarget->hasInt256() && 06110 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64))) 06111 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); 06112 06113 // The scalar source must be a normal load. 06114 if (!IsLoad) 06115 return SDValue(); 06116 06117 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) 06118 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); 06119 06120 // The integer check is needed for the 64-bit into 128-bit so it doesn't match 06121 // double since there is no vbroadcastsd xmm 06122 if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) { 06123 if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64) 06124 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); 06125 } 06126 06127 // Unsupported broadcast. 06128 return SDValue(); 06129 } 06130 06131 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real 06132 /// underlying vector and index. 06133 /// 06134 /// Modifies \p ExtractedFromVec to the real vector and returns the real 06135 /// index. 06136 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec, 06137 SDValue ExtIdx) { 06138 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue(); 06139 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec)) 06140 return Idx; 06141 06142 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already 06143 // lowered this: 06144 // (extract_vector_elt (v8f32 %vreg1), Constant<6>) 06145 // to: 06146 // (extract_vector_elt (vector_shuffle<2,u,u,u> 06147 // (extract_subvector (v8f32 %vreg0), Constant<4>), 06148 // undef) 06149 // Constant<0>) 06150 // In this case the vector is the extract_subvector expression and the index 06151 // is 2, as specified by the shuffle. 
06152 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec); 06153 SDValue ShuffleVec = SVOp->getOperand(0); 06154 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType(); 06155 assert(ShuffleVecVT.getVectorElementType() == 06156 ExtractedFromVec.getSimpleValueType().getVectorElementType()); 06157 06158 int ShuffleIdx = SVOp->getMaskElt(Idx); 06159 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) { 06160 ExtractedFromVec = ShuffleVec; 06161 return ShuffleIdx; 06162 } 06163 return Idx; 06164 } 06165 06166 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) { 06167 MVT VT = Op.getSimpleValueType(); 06168 06169 // Skip if insert_vec_elt is not supported. 06170 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 06171 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT)) 06172 return SDValue(); 06173 06174 SDLoc DL(Op); 06175 unsigned NumElems = Op.getNumOperands(); 06176 06177 SDValue VecIn1; 06178 SDValue VecIn2; 06179 SmallVector<unsigned, 4> InsertIndices; 06180 SmallVector<int, 8> Mask(NumElems, -1); 06181 06182 for (unsigned i = 0; i != NumElems; ++i) { 06183 unsigned Opc = Op.getOperand(i).getOpcode(); 06184 06185 if (Opc == ISD::UNDEF) 06186 continue; 06187 06188 if (Opc != ISD::EXTRACT_VECTOR_ELT) { 06189 // Quit if more than 1 elements need inserting. 06190 if (InsertIndices.size() > 1) 06191 return SDValue(); 06192 06193 InsertIndices.push_back(i); 06194 continue; 06195 } 06196 06197 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0); 06198 SDValue ExtIdx = Op.getOperand(i).getOperand(1); 06199 // Quit if non-constant index. 06200 if (!isa<ConstantSDNode>(ExtIdx)) 06201 return SDValue(); 06202 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx); 06203 06204 // Quit if extracted from vector of different type. 06205 if (ExtractedFromVec.getValueType() != VT) 06206 return SDValue(); 06207 06208 if (!VecIn1.getNode()) 06209 VecIn1 = ExtractedFromVec; 06210 else if (VecIn1 != ExtractedFromVec) { 06211 if (!VecIn2.getNode()) 06212 VecIn2 = ExtractedFromVec; 06213 else if (VecIn2 != ExtractedFromVec) 06214 // Quit if more than 2 vectors to shuffle 06215 return SDValue(); 06216 } 06217 06218 if (ExtractedFromVec == VecIn1) 06219 Mask[i] = Idx; 06220 else if (ExtractedFromVec == VecIn2) 06221 Mask[i] = Idx + NumElems; 06222 } 06223 06224 if (!VecIn1.getNode()) 06225 return SDValue(); 06226 06227 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT); 06228 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]); 06229 for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) { 06230 unsigned Idx = InsertIndices[i]; 06231 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx), 06232 DAG.getIntPtrConstant(Idx)); 06233 } 06234 06235 return NV; 06236 } 06237 06238 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types. 
06239 SDValue 06240 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { 06241 06242 MVT VT = Op.getSimpleValueType(); 06243 assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) && 06244 "Unexpected type in LowerBUILD_VECTORvXi1!"); 06245 06246 SDLoc dl(Op); 06247 if (ISD::isBuildVectorAllZeros(Op.getNode())) { 06248 SDValue Cst = DAG.getTargetConstant(0, MVT::i1); 06249 SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst); 06250 return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); 06251 } 06252 06253 if (ISD::isBuildVectorAllOnes(Op.getNode())) { 06254 SDValue Cst = DAG.getTargetConstant(1, MVT::i1); 06255 SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst); 06256 return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); 06257 } 06258 06259 bool AllContants = true; 06260 uint64_t Immediate = 0; 06261 int NonConstIdx = -1; 06262 bool IsSplat = true; 06263 unsigned NumNonConsts = 0; 06264 unsigned NumConsts = 0; 06265 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) { 06266 SDValue In = Op.getOperand(idx); 06267 if (In.getOpcode() == ISD::UNDEF) 06268 continue; 06269 if (!isa<ConstantSDNode>(In)) { 06270 AllContants = false; 06271 NonConstIdx = idx; 06272 NumNonConsts++; 06273 } 06274 else { 06275 NumConsts++; 06276 if (cast<ConstantSDNode>(In)->getZExtValue()) 06277 Immediate |= (1ULL << idx); 06278 } 06279 if (In != Op.getOperand(0)) 06280 IsSplat = false; 06281 } 06282 06283 if (AllContants) { 06284 SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, 06285 DAG.getConstant(Immediate, MVT::i16)); 06286 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask, 06287 DAG.getIntPtrConstant(0)); 06288 } 06289 06290 if (NumNonConsts == 1 && NonConstIdx != 0) { 06291 SDValue DstVec; 06292 if (NumConsts) { 06293 SDValue VecAsImm = DAG.getConstant(Immediate, 06294 MVT::getIntegerVT(VT.getSizeInBits())); 06295 DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm); 06296 } 06297 else 06298 DstVec = DAG.getUNDEF(VT); 06299 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, 06300 Op.getOperand(NonConstIdx), 06301 DAG.getIntPtrConstant(NonConstIdx)); 06302 } 06303 if (!IsSplat && (NonConstIdx != 0)) 06304 llvm_unreachable("Unsupported BUILD_VECTOR operation"); 06305 MVT SelectVT = (VT == MVT::v16i1)? MVT::i16 : MVT::i8; 06306 SDValue Select; 06307 if (IsSplat) 06308 Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0), 06309 DAG.getConstant(-1, SelectVT), 06310 DAG.getConstant(0, SelectVT)); 06311 else 06312 Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0), 06313 DAG.getConstant((Immediate | 1), SelectVT), 06314 DAG.getConstant(Immediate, SelectVT)); 06315 return DAG.getNode(ISD::BITCAST, dl, VT, Select); 06316 } 06317 06318 /// \brief Return true if \p N implements a horizontal binop and return the 06319 /// operands for the horizontal binop into V0 and V1. 06320 /// 06321 /// This is a helper function of PerformBUILD_VECTORCombine. 06322 /// This function checks that the build_vector \p N in input implements a 06323 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal 06324 /// operation to match. 06325 /// For example, if \p Opcode is equal to ISD::ADD, then this function 06326 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode 06327 /// is equal to ISD::SUB, then this function checks if this is a horizontal 06328 /// arithmetic sub. 
06329 /// 06330 /// This function only analyzes elements of \p N whose indices are 06331 /// in range [BaseIdx, LastIdx). 06332 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode, 06333 SelectionDAG &DAG, 06334 unsigned BaseIdx, unsigned LastIdx, 06335 SDValue &V0, SDValue &V1) { 06336 EVT VT = N->getValueType(0); 06337 06338 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!"); 06339 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx && 06340 "Invalid Vector in input!"); 06341 06342 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD); 06343 bool CanFold = true; 06344 unsigned ExpectedVExtractIdx = BaseIdx; 06345 unsigned NumElts = LastIdx - BaseIdx; 06346 V0 = DAG.getUNDEF(VT); 06347 V1 = DAG.getUNDEF(VT); 06348 06349 // Check if N implements a horizontal binop. 06350 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) { 06351 SDValue Op = N->getOperand(i + BaseIdx); 06352 06353 // Skip UNDEFs. 06354 if (Op->getOpcode() == ISD::UNDEF) { 06355 // Update the expected vector extract index. 06356 if (i * 2 == NumElts) 06357 ExpectedVExtractIdx = BaseIdx; 06358 ExpectedVExtractIdx += 2; 06359 continue; 06360 } 06361 06362 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse(); 06363 06364 if (!CanFold) 06365 break; 06366 06367 SDValue Op0 = Op.getOperand(0); 06368 SDValue Op1 = Op.getOperand(1); 06369 06370 // Try to match the following pattern: 06371 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1)) 06372 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && 06373 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT && 06374 Op0.getOperand(0) == Op1.getOperand(0) && 06375 isa<ConstantSDNode>(Op0.getOperand(1)) && 06376 isa<ConstantSDNode>(Op1.getOperand(1))); 06377 if (!CanFold) 06378 break; 06379 06380 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue(); 06381 unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue(); 06382 06383 if (i * 2 < NumElts) { 06384 if (V0.getOpcode() == ISD::UNDEF) 06385 V0 = Op0.getOperand(0); 06386 } else { 06387 if (V1.getOpcode() == ISD::UNDEF) 06388 V1 = Op0.getOperand(0); 06389 if (i * 2 == NumElts) 06390 ExpectedVExtractIdx = BaseIdx; 06391 } 06392 06393 SDValue Expected = (i * 2 < NumElts) ? V0 : V1; 06394 if (I0 == ExpectedVExtractIdx) 06395 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected; 06396 else if (IsCommutable && I1 == ExpectedVExtractIdx) { 06397 // Try to match the following dag sequence: 06398 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I)) 06399 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected; 06400 } else 06401 CanFold = false; 06402 06403 ExpectedVExtractIdx += 2; 06404 } 06405 06406 return CanFold; 06407 } 06408 06409 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by 06410 /// a concat_vector. 06411 /// 06412 /// This is a helper function of PerformBUILD_VECTORCombine. 06413 /// This function expects two 256-bit vectors called V0 and V1. 06414 /// At first, each vector is split into two separate 128-bit vectors. 06415 /// Then, the resulting 128-bit vectors are used to implement two 06416 /// horizontal binary operations. 06417 /// 06418 /// The kind of horizontal binary operation is defined by \p X86Opcode. 06419 /// 06420 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to 06421 /// the two new horizontal binop. 06422 /// When Mode is set, the first horizontal binop dag node would take as input 06423 /// the lower 128-bit of V0 and the upper 128-bit of V0. 
The second 06424 /// horizontal binop dag node would take as input the lower 128-bit of V1 06425 /// and the upper 128-bit of V1. 06426 /// Example: 06427 /// HADD V0_LO, V0_HI 06428 /// HADD V1_LO, V1_HI 06429 /// 06430 /// Otherwise, the first horizontal binop dag node takes as input the lower 06431 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop 06432 /// dag node takes the the upper 128-bit of V0 and the upper 128-bit of V1. 06433 /// Example: 06434 /// HADD V0_LO, V1_LO 06435 /// HADD V0_HI, V1_HI 06436 /// 06437 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower 06438 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to 06439 /// the upper 128-bits of the result. 06440 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, 06441 SDLoc DL, SelectionDAG &DAG, 06442 unsigned X86Opcode, bool Mode, 06443 bool isUndefLO, bool isUndefHI) { 06444 EVT VT = V0.getValueType(); 06445 assert(VT.is256BitVector() && VT == V1.getValueType() && 06446 "Invalid nodes in input!"); 06447 06448 unsigned NumElts = VT.getVectorNumElements(); 06449 SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL); 06450 SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL); 06451 SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL); 06452 SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL); 06453 EVT NewVT = V0_LO.getValueType(); 06454 06455 SDValue LO = DAG.getUNDEF(NewVT); 06456 SDValue HI = DAG.getUNDEF(NewVT); 06457 06458 if (Mode) { 06459 // Don't emit a horizontal binop if the result is expected to be UNDEF. 06460 if (!isUndefLO && V0->getOpcode() != ISD::UNDEF) 06461 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI); 06462 if (!isUndefHI && V1->getOpcode() != ISD::UNDEF) 06463 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI); 06464 } else { 06465 // Don't emit a horizontal binop if the result is expected to be UNDEF. 06466 if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF || 06467 V1_LO->getOpcode() != ISD::UNDEF)) 06468 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO); 06469 06470 if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF || 06471 V1_HI->getOpcode() != ISD::UNDEF)) 06472 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI); 06473 } 06474 06475 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI); 06476 } 06477 06478 /// \brief Try to fold a build_vector that performs an 'addsub' into the 06479 /// sequence of 'vadd + vsub + blendi'. 06480 static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG, 06481 const X86Subtarget *Subtarget) { 06482 SDLoc DL(BV); 06483 EVT VT = BV->getValueType(0); 06484 unsigned NumElts = VT.getVectorNumElements(); 06485 SDValue InVec0 = DAG.getUNDEF(VT); 06486 SDValue InVec1 = DAG.getUNDEF(VT); 06487 06488 assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 || 06489 VT == MVT::v2f64) && "build_vector with an invalid type found!"); 06490 06491 // Odd-numbered elements in the input build vector are obtained from 06492 // adding two integer/float elements. 06493 // Even-numbered elements in the input build vector are obtained from 06494 // subtracting two integer/float elements. 06495 unsigned ExpectedOpcode = ISD::FSUB; 06496 unsigned NextExpectedOpcode = ISD::FADD; 06497 bool AddFound = false; 06498 bool SubFound = false; 06499 06500 for (unsigned i = 0, e = NumElts; i != e; i++) { 06501 SDValue Op = BV->getOperand(i); 06502 06503 // Skip 'undef' values. 
06504 unsigned Opcode = Op.getOpcode(); 06505 if (Opcode == ISD::UNDEF) { 06506 std::swap(ExpectedOpcode, NextExpectedOpcode); 06507 continue; 06508 } 06509 06510 // Early exit if we found an unexpected opcode. 06511 if (Opcode != ExpectedOpcode) 06512 return SDValue(); 06513 06514 SDValue Op0 = Op.getOperand(0); 06515 SDValue Op1 = Op.getOperand(1); 06516 06517 // Try to match the following pattern: 06518 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i)) 06519 // Early exit if we cannot match that sequence. 06520 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || 06521 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || 06522 !isa<ConstantSDNode>(Op0.getOperand(1)) || 06523 !isa<ConstantSDNode>(Op1.getOperand(1)) || 06524 Op0.getOperand(1) != Op1.getOperand(1)) 06525 return SDValue(); 06526 06527 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue(); 06528 if (I0 != i) 06529 return SDValue(); 06530 06531 // We found a valid add/sub node. Update the information accordingly. 06532 if (i & 1) 06533 AddFound = true; 06534 else 06535 SubFound = true; 06536 06537 // Update InVec0 and InVec1. 06538 if (InVec0.getOpcode() == ISD::UNDEF) 06539 InVec0 = Op0.getOperand(0); 06540 if (InVec1.getOpcode() == ISD::UNDEF) 06541 InVec1 = Op1.getOperand(0); 06542 06543 // Make sure that operands in input to each add/sub node always 06544 // come from a same pair of vectors. 06545 if (InVec0 != Op0.getOperand(0)) { 06546 if (ExpectedOpcode == ISD::FSUB) 06547 return SDValue(); 06548 06549 // FADD is commutable. Try to commute the operands 06550 // and then test again. 06551 std::swap(Op0, Op1); 06552 if (InVec0 != Op0.getOperand(0)) 06553 return SDValue(); 06554 } 06555 06556 if (InVec1 != Op1.getOperand(0)) 06557 return SDValue(); 06558 06559 // Update the pair of expected opcodes. 06560 std::swap(ExpectedOpcode, NextExpectedOpcode); 06561 } 06562 06563 // Don't try to fold this build_vector into an ADDSUB if the inputs are undef. 06564 if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF && 06565 InVec1.getOpcode() != ISD::UNDEF) 06566 return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1); 06567 06568 return SDValue(); 06569 } 06570 06571 static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG, 06572 const X86Subtarget *Subtarget) { 06573 SDLoc DL(N); 06574 EVT VT = N->getValueType(0); 06575 unsigned NumElts = VT.getVectorNumElements(); 06576 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(N); 06577 SDValue InVec0, InVec1; 06578 06579 // Try to match an ADDSUB. 06580 if ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || 06581 (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) { 06582 SDValue Value = matchAddSub(BV, DAG, Subtarget); 06583 if (Value.getNode()) 06584 return Value; 06585 } 06586 06587 // Try to match horizontal ADD/SUB. 06588 unsigned NumUndefsLO = 0; 06589 unsigned NumUndefsHI = 0; 06590 unsigned Half = NumElts/2; 06591 06592 // Count the number of UNDEF operands in the build_vector in input. 06593 for (unsigned i = 0, e = Half; i != e; ++i) 06594 if (BV->getOperand(i)->getOpcode() == ISD::UNDEF) 06595 NumUndefsLO++; 06596 06597 for (unsigned i = Half, e = NumElts; i != e; ++i) 06598 if (BV->getOperand(i)->getOpcode() == ISD::UNDEF) 06599 NumUndefsHI++; 06600 06601 // Early exit if this is either a build_vector of all UNDEFs or all the 06602 // operands but one are UNDEF. 
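// --- Editorial sketch (not part of the LLVM source; the helper name is
// illustrative). Lane pattern accepted by matchAddSub above: element i of the
// build_vector must be (binop (extract InVec0, i), (extract InVec1, i)), with
// FSUB on even lanes and FADD on odd lanes -- exactly what ADDSUBPS/ADDSUBPD
// compute. Standalone model of the per-lane opcode rule:
static bool modelAddSubLaneOpcodeOk(unsigned LaneIdx, bool LaneIsFAdd) {
  // Odd lanes add, even lanes subtract.
  return LaneIsFAdd == ((LaneIdx & 1) != 0);
}
// --- End of editorial sketch; the original listing continues below. --------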
06603 if (NumUndefsLO + NumUndefsHI + 1 >= NumElts) 06604 return SDValue(); 06605 06606 if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) { 06607 // Try to match an SSE3 float HADD/HSUB. 06608 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1)) 06609 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1); 06610 06611 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1)) 06612 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1); 06613 } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) { 06614 // Try to match an SSSE3 integer HADD/HSUB. 06615 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1)) 06616 return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1); 06617 06618 if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1)) 06619 return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1); 06620 } 06621 06622 if (!Subtarget->hasAVX()) 06623 return SDValue(); 06624 06625 if ((VT == MVT::v8f32 || VT == MVT::v4f64)) { 06626 // Try to match an AVX horizontal add/sub of packed single/double 06627 // precision floating point values from 256-bit vectors. 06628 SDValue InVec2, InVec3; 06629 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) && 06630 isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) && 06631 ((InVec0.getOpcode() == ISD::UNDEF || 06632 InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) && 06633 ((InVec1.getOpcode() == ISD::UNDEF || 06634 InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3)) 06635 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1); 06636 06637 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) && 06638 isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) && 06639 ((InVec0.getOpcode() == ISD::UNDEF || 06640 InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) && 06641 ((InVec1.getOpcode() == ISD::UNDEF || 06642 InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3)) 06643 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1); 06644 } else if (VT == MVT::v8i32 || VT == MVT::v16i16) { 06645 // Try to match an AVX2 horizontal add/sub of signed integers. 06646 SDValue InVec2, InVec3; 06647 unsigned X86Opcode; 06648 bool CanFold = true; 06649 06650 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) && 06651 isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) && 06652 ((InVec0.getOpcode() == ISD::UNDEF || 06653 InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) && 06654 ((InVec1.getOpcode() == ISD::UNDEF || 06655 InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3)) 06656 X86Opcode = X86ISD::HADD; 06657 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) && 06658 isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) && 06659 ((InVec0.getOpcode() == ISD::UNDEF || 06660 InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) && 06661 ((InVec1.getOpcode() == ISD::UNDEF || 06662 InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3)) 06663 X86Opcode = X86ISD::HSUB; 06664 else 06665 CanFold = false; 06666 06667 if (CanFold) { 06668 // Fold this build_vector into a single horizontal add/sub. 06669 // Do this only if the target has AVX2. 06670 if (Subtarget->hasAVX2()) 06671 return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1); 06672 06673 // Do not try to expand this build_vector into a pair of horizontal 06674 // add/sub if we can emit a pair of scalar add/sub. 
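// --- Editorial sketch (not part of the LLVM source; the helper name is
// illustrative). Within one 128-bit half, isHorizontalBinOp accepts result
// lane i only when it is BinOp(Src[2*i], Src[2*i + 1]) -- or the commuted
// order for ADD/FADD -- which is the lane pairing HADDPS/PHADDW and friends
// implement. Standalone model of that index check (the reset applied when
// crossing into the upper source vector is left out for brevity):
static bool modelHorizontalLaneOk(unsigned LaneIdx, unsigned ExtractIdx0,
                                  unsigned ExtractIdx1, bool Commutable) {
  if (ExtractIdx0 == 2 * LaneIdx && ExtractIdx1 == 2 * LaneIdx + 1)
    return true;
  return Commutable && ExtractIdx1 == 2 * LaneIdx &&
         ExtractIdx0 == 2 * LaneIdx + 1;
}
// --- End of editorial sketch; the original listing continues below. --------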
06675 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half) 06676 return SDValue(); 06677 06678 // Convert this build_vector into a pair of horizontal binop followed by 06679 // a concat vector. 06680 bool isUndefLO = NumUndefsLO == Half; 06681 bool isUndefHI = NumUndefsHI == Half; 06682 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false, 06683 isUndefLO, isUndefHI); 06684 } 06685 } 06686 06687 if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 || 06688 VT == MVT::v16i16) && Subtarget->hasAVX()) { 06689 unsigned X86Opcode; 06690 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1)) 06691 X86Opcode = X86ISD::HADD; 06692 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1)) 06693 X86Opcode = X86ISD::HSUB; 06694 else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1)) 06695 X86Opcode = X86ISD::FHADD; 06696 else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1)) 06697 X86Opcode = X86ISD::FHSUB; 06698 else 06699 return SDValue(); 06700 06701 // Don't try to expand this build_vector into a pair of horizontal add/sub 06702 // if we can simply emit a pair of scalar add/sub. 06703 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half) 06704 return SDValue(); 06705 06706 // Convert this build_vector into two horizontal add/sub followed by 06707 // a concat vector. 06708 bool isUndefLO = NumUndefsLO == Half; 06709 bool isUndefHI = NumUndefsHI == Half; 06710 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true, 06711 isUndefLO, isUndefHI); 06712 } 06713 06714 return SDValue(); 06715 } 06716 06717 SDValue 06718 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { 06719 SDLoc dl(Op); 06720 06721 MVT VT = Op.getSimpleValueType(); 06722 MVT ExtVT = VT.getVectorElementType(); 06723 unsigned NumElems = Op.getNumOperands(); 06724 06725 // Generate vectors for predicate vectors. 06726 if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512()) 06727 return LowerBUILD_VECTORvXi1(Op, DAG); 06728 06729 // Vectors containing all zeros can be matched by pxor and xorps later 06730 if (ISD::isBuildVectorAllZeros(Op.getNode())) { 06731 // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd 06732 // and 2) ensure that i64 scalars are eliminated on x86-32 hosts. 06733 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) 06734 return Op; 06735 06736 return getZeroVector(VT, Subtarget, DAG, dl); 06737 } 06738 06739 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width 06740 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use 06741 // vpcmpeqd on 256-bit vectors. 
06742 if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) { 06743 if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256())) 06744 return Op; 06745 06746 if (!VT.is512BitVector()) 06747 return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl); 06748 } 06749 06750 SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG); 06751 if (Broadcast.getNode()) 06752 return Broadcast; 06753 06754 unsigned EVTBits = ExtVT.getSizeInBits(); 06755 06756 unsigned NumZero = 0; 06757 unsigned NumNonZero = 0; 06758 unsigned NonZeros = 0; 06759 bool IsAllConstants = true; 06760 SmallSet<SDValue, 8> Values; 06761 for (unsigned i = 0; i < NumElems; ++i) { 06762 SDValue Elt = Op.getOperand(i); 06763 if (Elt.getOpcode() == ISD::UNDEF) 06764 continue; 06765 Values.insert(Elt); 06766 if (Elt.getOpcode() != ISD::Constant && 06767 Elt.getOpcode() != ISD::ConstantFP) 06768 IsAllConstants = false; 06769 if (X86::isZeroNode(Elt)) 06770 NumZero++; 06771 else { 06772 NonZeros |= (1 << i); 06773 NumNonZero++; 06774 } 06775 } 06776 06777 // All undef vector. Return an UNDEF. All zero vectors were handled above. 06778 if (NumNonZero == 0) 06779 return DAG.getUNDEF(VT); 06780 06781 // Special case for single non-zero, non-undef, element. 06782 if (NumNonZero == 1) { 06783 unsigned Idx = countTrailingZeros(NonZeros); 06784 SDValue Item = Op.getOperand(Idx); 06785 06786 // If this is an insertion of an i64 value on x86-32, and if the top bits of 06787 // the value are obviously zero, truncate the value to i32 and do the 06788 // insertion that way. Only do this if the value is non-constant or if the 06789 // value is a constant being inserted into element 0. It is cheaper to do 06790 // a constant pool load than it is to do a movd + shuffle. 06791 if (ExtVT == MVT::i64 && !Subtarget->is64Bit() && 06792 (!IsAllConstants || Idx == 0)) { 06793 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { 06794 // Handle SSE only. 06795 assert(VT == MVT::v2i64 && "Expected an SSE value type!"); 06796 EVT VecVT = MVT::v4i32; 06797 unsigned VecElts = 4; 06798 06799 // Truncate the value (which may itself be a constant) to i32, and 06800 // convert it to a vector with movd (S2V+shuffle to zero extend). 06801 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); 06802 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); 06803 06804 // If using the new shuffle lowering, just directly insert this. 06805 if (ExperimentalVectorShuffleLowering) 06806 return DAG.getNode( 06807 ISD::BITCAST, dl, VT, 06808 getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG)); 06809 06810 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); 06811 06812 // Now we have our 32-bit value zero extended in the low element of 06813 // a vector. If Idx != 0, swizzle it into place. 06814 if (Idx != 0) { 06815 SmallVector<int, 4> Mask; 06816 Mask.push_back(Idx); 06817 for (unsigned i = 1; i != VecElts; ++i) 06818 Mask.push_back(i); 06819 Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT), 06820 &Mask[0]); 06821 } 06822 return DAG.getNode(ISD::BITCAST, dl, VT, Item); 06823 } 06824 } 06825 06826 // If we have a constant or non-constant insertion into the low element of 06827 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into 06828 // the rest of the elements. This will be matched as movd/movq/movss/movsd 06829 // depending on what the source datatype is. 
06830 if (Idx == 0) { 06831 if (NumZero == 0) 06832 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 06833 06834 if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || 06835 (ExtVT == MVT::i64 && Subtarget->is64Bit())) { 06836 if (VT.is256BitVector() || VT.is512BitVector()) { 06837 SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl); 06838 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec, 06839 Item, DAG.getIntPtrConstant(0)); 06840 } 06841 assert(VT.is128BitVector() && "Expected an SSE value type!"); 06842 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 06843 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. 06844 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); 06845 } 06846 06847 if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { 06848 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); 06849 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); 06850 if (VT.is256BitVector()) { 06851 SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl); 06852 Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl); 06853 } else { 06854 assert(VT.is128BitVector() && "Expected an SSE value type!"); 06855 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); 06856 } 06857 return DAG.getNode(ISD::BITCAST, dl, VT, Item); 06858 } 06859 } 06860 06861 // Is it a vector logical left shift? 06862 if (NumElems == 2 && Idx == 1 && 06863 X86::isZeroNode(Op.getOperand(0)) && 06864 !X86::isZeroNode(Op.getOperand(1))) { 06865 unsigned NumBits = VT.getSizeInBits(); 06866 return getVShift(true, VT, 06867 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 06868 VT, Op.getOperand(1)), 06869 NumBits/2, DAG, *this, dl); 06870 } 06871 06872 if (IsAllConstants) // Otherwise, it's better to do a constpool load. 06873 return SDValue(); 06874 06875 // Otherwise, if this is a vector with i32 or f32 elements, and the element 06876 // is a non-constant being inserted into an element other than the low one, 06877 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka 06878 // movd/movss) to move this into the low element, then shuffle it into 06879 // place. 06880 if (EVTBits == 32) { 06881 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 06882 06883 // If using the new shuffle lowering, just directly insert this. 06884 if (ExperimentalVectorShuffleLowering) 06885 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG); 06886 06887 // Turn it into a shuffle of zero and zero-extended scalar to vector. 06888 Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG); 06889 SmallVector<int, 8> MaskVec; 06890 for (unsigned i = 0; i != NumElems; ++i) 06891 MaskVec.push_back(i == Idx ? 0 : 1); 06892 return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]); 06893 } 06894 } 06895 06896 // Splat is obviously ok. Let legalizer expand it to a shuffle. 06897 if (Values.size() == 1) { 06898 if (EVTBits == 32) { 06899 // Instead of a shuffle like this: 06900 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> 06901 // Check if it's possible to issue this instead. 
06902 // shuffle (vload ptr)), undef, <1, 1, 1, 1> 06903 unsigned Idx = countTrailingZeros(NonZeros); 06904 SDValue Item = Op.getOperand(Idx); 06905 if (Op.getNode()->isOnlyUserOf(Item.getNode())) 06906 return LowerAsSplatVectorLoad(Item, VT, dl, DAG); 06907 } 06908 return SDValue(); 06909 } 06910 06911 // A vector full of immediates; various special cases are already 06912 // handled, so this is best done with a single constant-pool load. 06913 if (IsAllConstants) 06914 return SDValue(); 06915 06916 // For AVX-length vectors, build the individual 128-bit pieces and use 06917 // shuffles to put them in place. 06918 if (VT.is256BitVector() || VT.is512BitVector()) { 06919 SmallVector<SDValue, 64> V; 06920 for (unsigned i = 0; i != NumElems; ++i) 06921 V.push_back(Op.getOperand(i)); 06922 06923 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2); 06924 06925 // Build both the lower and upper subvector. 06926 SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, 06927 makeArrayRef(&V[0], NumElems/2)); 06928 SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, 06929 makeArrayRef(&V[NumElems / 2], NumElems/2)); 06930 06931 // Recreate the wider vector with the lower and upper part. 06932 if (VT.is256BitVector()) 06933 return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl); 06934 return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl); 06935 } 06936 06937 // Let legalizer expand 2-wide build_vectors. 06938 if (EVTBits == 64) { 06939 if (NumNonZero == 1) { 06940 // One half is zero or undef. 06941 unsigned Idx = countTrailingZeros(NonZeros); 06942 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, 06943 Op.getOperand(Idx)); 06944 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG); 06945 } 06946 return SDValue(); 06947 } 06948 06949 // If element VT is < 32 bits, convert it to inserts into a zero vector. 06950 if (EVTBits == 8 && NumElems == 16) { 06951 SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, 06952 Subtarget, *this); 06953 if (V.getNode()) return V; 06954 } 06955 06956 if (EVTBits == 16 && NumElems == 8) { 06957 SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, 06958 Subtarget, *this); 06959 if (V.getNode()) return V; 06960 } 06961 06962 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS 06963 if (EVTBits == 32 && NumElems == 4) { 06964 SDValue V = LowerBuildVectorv4x32(Op, NumElems, NonZeros, NumNonZero, 06965 NumZero, DAG, Subtarget, *this); 06966 if (V.getNode()) 06967 return V; 06968 } 06969 06970 // If element VT is == 32 bits, turn it into a number of shuffles. 06971 SmallVector<SDValue, 8> V(NumElems); 06972 if (NumElems == 4 && NumZero > 0) { 06973 for (unsigned i = 0; i < 4; ++i) { 06974 bool isZero = !(NonZeros & (1 << i)); 06975 if (isZero) 06976 V[i] = getZeroVector(VT, Subtarget, DAG, dl); 06977 else 06978 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 06979 } 06980 06981 for (unsigned i = 0; i < 2; ++i) { 06982 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { 06983 default: break; 06984 case 0: 06985 V[i] = V[i*2]; // Must be a zero vector. 
06986 break; 06987 case 1: 06988 V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]); 06989 break; 06990 case 2: 06991 V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]); 06992 break; 06993 case 3: 06994 V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]); 06995 break; 06996 } 06997 } 06998 06999 bool Reverse1 = (NonZeros & 0x3) == 2; 07000 bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2; 07001 int MaskVec[] = { 07002 Reverse1 ? 1 : 0, 07003 Reverse1 ? 0 : 1, 07004 static_cast<int>(Reverse2 ? NumElems+1 : NumElems), 07005 static_cast<int>(Reverse2 ? NumElems : NumElems+1) 07006 }; 07007 return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); 07008 } 07009 07010 if (Values.size() > 1 && VT.is128BitVector()) { 07011 // Check for a build vector of consecutive loads. 07012 for (unsigned i = 0; i < NumElems; ++i) 07013 V[i] = Op.getOperand(i); 07014 07015 // Check for elements which are consecutive loads. 07016 SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false); 07017 if (LD.getNode()) 07018 return LD; 07019 07020 // Check for a build vector from mostly shuffle plus few inserting. 07021 SDValue Sh = buildFromShuffleMostly(Op, DAG); 07022 if (Sh.getNode()) 07023 return Sh; 07024 07025 // For SSE 4.1, use insertps to put the high elements into the low element. 07026 if (getSubtarget()->hasSSE41()) { 07027 SDValue Result; 07028 if (Op.getOperand(0).getOpcode() != ISD::UNDEF) 07029 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); 07030 else 07031 Result = DAG.getUNDEF(VT); 07032 07033 for (unsigned i = 1; i < NumElems; ++i) { 07034 if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue; 07035 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result, 07036 Op.getOperand(i), DAG.getIntPtrConstant(i)); 07037 } 07038 return Result; 07039 } 07040 07041 // Otherwise, expand into a number of unpckl*, start by extending each of 07042 // our (non-undef) elements to the full vector width with the element in the 07043 // bottom slot of the vector (which generates no code for SSE). 07044 for (unsigned i = 0; i < NumElems; ++i) { 07045 if (Op.getOperand(i).getOpcode() != ISD::UNDEF) 07046 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 07047 else 07048 V[i] = DAG.getUNDEF(VT); 07049 } 07050 07051 // Next, we iteratively mix elements, e.g. for v4f32: 07052 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> 07053 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> 07054 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0> 07055 unsigned EltStride = NumElems >> 1; 07056 while (EltStride != 0) { 07057 for (unsigned i = 0; i < EltStride; ++i) { 07058 // If V[i+EltStride] is undef and this is the first round of mixing, 07059 // then it is safe to just drop this shuffle: V[i] is already in the 07060 // right place, the one element (since it's the first round) being 07061 // inserted as undef can be dropped. This isn't safe for successive 07062 // rounds because they will permute elements within both vectors. 07063 if (V[i+EltStride].getOpcode() == ISD::UNDEF && 07064 EltStride == NumElems/2) 07065 continue; 07066 07067 V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]); 07068 } 07069 EltStride >>= 1; 07070 } 07071 return V[0]; 07072 } 07073 return SDValue(); 07074 } 07075 07076 // LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction 07077 // to create 256-bit vectors from two other 128-bit ones. 
07078 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { 07079 SDLoc dl(Op); 07080 MVT ResVT = Op.getSimpleValueType(); 07081 07082 assert((ResVT.is256BitVector() || 07083 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide"); 07084 07085 SDValue V1 = Op.getOperand(0); 07086 SDValue V2 = Op.getOperand(1); 07087 unsigned NumElems = ResVT.getVectorNumElements(); 07088 if(ResVT.is256BitVector()) 07089 return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl); 07090 07091 if (Op.getNumOperands() == 4) { 07092 MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(), 07093 ResVT.getVectorNumElements()/2); 07094 SDValue V3 = Op.getOperand(2); 07095 SDValue V4 = Op.getOperand(3); 07096 return Concat256BitVectors(Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl), 07097 Concat128BitVectors(V3, V4, HalfVT, NumElems/2, DAG, dl), ResVT, NumElems, DAG, dl); 07098 } 07099 return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl); 07100 } 07101 07102 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { 07103 MVT LLVM_ATTRIBUTE_UNUSED VT = Op.getSimpleValueType(); 07104 assert((VT.is256BitVector() && Op.getNumOperands() == 2) || 07105 (VT.is512BitVector() && (Op.getNumOperands() == 2 || 07106 Op.getNumOperands() == 4))); 07107 07108 // AVX can use the vinsertf128 instruction to create 256-bit vectors 07109 // from two other 128-bit ones. 07110 07111 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors 07112 return LowerAVXCONCAT_VECTORS(Op, DAG); 07113 } 07114 07115 07116 //===----------------------------------------------------------------------===// 07117 // Vector shuffle lowering 07118 // 07119 // This is an experimental code path for lowering vector shuffles on x86. It is 07120 // designed to handle arbitrary vector shuffles and blends, gracefully 07121 // degrading performance as necessary. It works hard to recognize idiomatic 07122 // shuffles and lower them to optimal instruction patterns without leaving 07123 // a framework that allows reasonably efficient handling of all vector shuffle 07124 // patterns. 07125 //===----------------------------------------------------------------------===// 07126 07127 /// \brief Tiny helper function to identify a no-op mask. 07128 /// 07129 /// This is a somewhat boring predicate function. It checks whether the mask 07130 /// array input, which is assumed to be a single-input shuffle mask of the kind 07131 /// used by the X86 shuffle instructions (not a fully general 07132 /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an 07133 /// in-place shuffle are 'no-op's. 07134 static bool isNoopShuffleMask(ArrayRef<int> Mask) { 07135 for (int i = 0, Size = Mask.size(); i < Size; ++i) 07136 if (Mask[i] != -1 && Mask[i] != i) 07137 return false; 07138 return true; 07139 } 07140 07141 /// \brief Helper function to classify a mask as a single-input mask. 07142 /// 07143 /// This isn't a generic single-input test because in the vector shuffle 07144 /// lowering we canonicalize single inputs to be the first input operand. This 07145 /// means we can more quickly test for a single input by only checking whether 07146 /// an input from the second operand exists. We also assume that the size of 07147 /// mask corresponds to the size of the input vectors which isn't true in the 07148 /// fully general case. 
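// --- Editorial note (not part of the LLVM source). Shape of the concat
// lowering above: a 256-bit result is a single insertion of two 128-bit
// halves (vinsertf128), and a 512-bit result given four 128-bit operands is
// assembled as two such 256-bit concats that are then concatenated again:
//   v512 = concat256( concat128(V1, V2), concat128(V3, V4) )
// --- End of editorial note; the original listing continues below. ----------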
07149 static bool isSingleInputShuffleMask(ArrayRef<int> Mask) { 07150 for (int M : Mask) 07151 if (M >= (int)Mask.size()) 07152 return false; 07153 return true; 07154 } 07155 07156 // Hide this symbol with an anonymous namespace instead of 'static' so that MSVC 07157 // 2013 will allow us to use it as a non-type template parameter. 07158 namespace { 07159 07160 /// \brief Implementation of the \c isShuffleEquivalent variadic functor. 07161 /// 07162 /// See its documentation for details. 07163 bool isShuffleEquivalentImpl(ArrayRef<int> Mask, ArrayRef<const int *> Args) { 07164 if (Mask.size() != Args.size()) 07165 return false; 07166 for (int i = 0, e = Mask.size(); i < e; ++i) { 07167 assert(*Args[i] >= 0 && "Arguments must be positive integers!"); 07168 assert(*Args[i] < (int)Args.size() * 2 && 07169 "Argument outside the range of possible shuffle inputs!"); 07170 if (Mask[i] != -1 && Mask[i] != *Args[i]) 07171 return false; 07172 } 07173 return true; 07174 } 07175 07176 } // namespace 07177 07178 /// \brief Checks whether a shuffle mask is equivalent to an explicit list of 07179 /// arguments. 07180 /// 07181 /// This is a fast way to test a shuffle mask against a fixed pattern: 07182 /// 07183 /// if (isShuffleEquivalent(Mask, 3, 2, 1, 0)) { ... } 07184 /// 07185 /// It returns true if the mask is exactly as wide as the argument list, and 07186 /// each element of the mask is either -1 (signifying undef) or the value given 07187 /// in the argument. 07188 static const VariadicFunction1< 07189 bool, ArrayRef<int>, int, isShuffleEquivalentImpl> isShuffleEquivalent = {}; 07190 07191 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask. 07192 /// 07193 /// This helper function produces an 8-bit shuffle immediate corresponding to 07194 /// the ubiquitous shuffle encoding scheme used in x86 instructions for 07195 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for 07196 /// example. 07197 /// 07198 /// NB: We rely heavily on "undef" masks preserving the input lane. 07199 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, 07200 SelectionDAG &DAG) { 07201 assert(Mask.size() == 4 && "Only 4-lane shuffle masks"); 07202 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!"); 07203 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!"); 07204 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!"); 07205 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!"); 07206 07207 unsigned Imm = 0; 07208 Imm |= (Mask[0] == -1 ? 0 : Mask[0]) << 0; 07209 Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2; 07210 Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4; 07211 Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6; 07212 return DAG.getConstant(Imm, MVT::i8); 07213 } 07214 07215 /// \brief Try to emit a blend instruction for a shuffle. 07216 /// 07217 /// This doesn't do any checks for the availability of instructions for blending 07218 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to 07219 /// be matched in the backend with the type given. What it does check for is 07220 /// that the shuffle mask is in fact a blend. 07221 static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, 07222 SDValue V2, ArrayRef<int> Mask, 07223 SelectionDAG &DAG) { 07224 07225 unsigned BlendMask = 0; 07226 for (int i = 0, Size = Mask.size(); i < Size; ++i) { 07227 if (Mask[i] >= Size) { 07228 if (Mask[i] != i + Size) 07229 return SDValue(); // Shuffled V2 input! 
07230 BlendMask |= 1u << i; 07231 continue; 07232 } 07233 if (Mask[i] >= 0 && Mask[i] != i) 07234 return SDValue(); // Shuffled V1 input! 07235 } 07236 if (VT == MVT::v4f32 || VT == MVT::v2f64) 07237 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2, 07238 DAG.getConstant(BlendMask, MVT::i8)); 07239 assert(!VT.isFloatingPoint() && "Only v4f32 and v2f64 are supported!"); 07240 07241 // For integer shuffles we need to expand the mask and cast the inputs to 07242 // v8i16s prior to blending. 07243 assert((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v2i64) && 07244 "Not a supported integer vector type!"); 07245 int Scale = 8 / VT.getVectorNumElements(); 07246 BlendMask = 0; 07247 for (int i = 0, Size = Mask.size(); i < Size; ++i) 07248 if (Mask[i] >= Size) 07249 for (int j = 0; j < Scale; ++j) 07250 BlendMask |= 1u << (i * Scale + j); 07251 07252 V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1); 07253 V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2); 07254 return DAG.getNode(ISD::BITCAST, DL, VT, 07255 DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2, 07256 DAG.getConstant(BlendMask, MVT::i8))); 07257 } 07258 07259 /// \brief Try to lower a vector shuffle as a byte rotation. 07260 /// 07261 /// We have a generic PALIGNR instruction in x86 that will do an arbitrary 07262 /// byte-rotation of a the concatentation of two vectors. This routine will 07263 /// try to generically lower a vector shuffle through such an instruction. It 07264 /// does not check for the availability of PALIGNR-based lowerings, only the 07265 /// applicability of this strategy to the given mask. This matches shuffle 07266 /// vectors that look like: 07267 /// 07268 /// v8i16 [11, 12, 13, 14, 15, 0, 1, 2] 07269 /// 07270 /// Essentially it concatenates V1 and V2, shifts right by some number of 07271 /// elements, and takes the low elements as the result. Note that while this is 07272 /// specified as a *right shift* because x86 is little-endian, it is a *left 07273 /// rotate* of the vector lanes. 07274 /// 07275 /// Note that this only handles 128-bit vector widths currently. 07276 static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1, 07277 SDValue V2, 07278 ArrayRef<int> Mask, 07279 SelectionDAG &DAG) { 07280 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"); 07281 07282 // We need to detect various ways of spelling a rotation: 07283 // [11, 12, 13, 14, 15, 0, 1, 2] 07284 // [-1, 12, 13, 14, -1, -1, 1, -1] 07285 // [-1, -1, -1, -1, -1, -1, 1, 2] 07286 // [ 3, 4, 5, 6, 7, 8, 9, 10] 07287 // [-1, 4, 5, 6, -1, -1, 9, -1] 07288 // [-1, 4, 5, 6, -1, -1, -1, -1] 07289 int Rotation = 0; 07290 SDValue Lo, Hi; 07291 for (int i = 0, Size = Mask.size(); i < Size; ++i) { 07292 if (Mask[i] == -1) 07293 continue; 07294 assert(Mask[i] >= 0 && "Only -1 is a valid negative mask element!"); 07295 07296 // Based on the mod-Size value of this mask element determine where 07297 // a rotated vector would have started. 07298 int StartIdx = i - (Mask[i] % Size); 07299 if (StartIdx == 0) 07300 // The identity rotation isn't interesting, stop. 07301 return SDValue(); 07302 07303 // If we found the tail of a vector the rotation must be the missing 07304 // front. If we found the head of a vector, it must be how much of the head. 07305 int CandidateRotation = StartIdx < 0 ? -StartIdx : Size - StartIdx; 07306 07307 if (Rotation == 0) 07308 Rotation = CandidateRotation; 07309 else if (Rotation != CandidateRotation) 07310 // The rotations don't match, so we can't match this mask. 
07311 return SDValue(); 07312 07313 // Compute which value this mask is pointing at. 07314 SDValue MaskV = Mask[i] < Size ? V1 : V2; 07315 07316 // Compute which of the two target values this index should be assigned to. 07317 // This reflects whether the high elements are remaining or the low elements 07318 // are remaining. 07319 SDValue &TargetV = StartIdx < 0 ? Hi : Lo; 07320 07321 // Either set up this value if we've not encountered it before, or check 07322 // that it remains consistent. 07323 if (!TargetV) 07324 TargetV = MaskV; 07325 else if (TargetV != MaskV) 07326 // This may be a rotation, but it pulls from the inputs in some 07327 // unsupported interleaving. 07328 return SDValue(); 07329 } 07330 07331 // Check that we successfully analyzed the mask, and normalize the results. 07332 assert(Rotation != 0 && "Failed to locate a viable rotation!"); 07333 assert((Lo || Hi) && "Failed to find a rotated input vector!"); 07334 if (!Lo) 07335 Lo = Hi; 07336 else if (!Hi) 07337 Hi = Lo; 07338 07339 // Cast the inputs to v16i8 to match PALIGNR. 07340 Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Lo); 07341 Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Hi); 07342 07343 assert(VT.getSizeInBits() == 128 && 07344 "Rotate-based lowering only supports 128-bit lowering!"); 07345 assert(Mask.size() <= 16 && 07346 "Can shuffle at most 16 bytes in a 128-bit vector!"); 07347 // The actual rotate instruction rotates bytes, so we need to scale the 07348 // rotation based on how many bytes are in the vector. 07349 int Scale = 16 / Mask.size(); 07350 07351 return DAG.getNode(ISD::BITCAST, DL, VT, 07352 DAG.getNode(X86ISD::PALIGNR, DL, MVT::v16i8, Hi, Lo, 07353 DAG.getConstant(Rotation * Scale, MVT::i8))); 07354 } 07355 07356 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles. 07357 /// 07358 /// This is the basis function for the 2-lane 64-bit shuffles as we have full 07359 /// support for floating point shuffles but not integer shuffles. These 07360 /// instructions will incur a domain crossing penalty on some chips though so 07361 /// it is better to avoid lowering through this for integer vectors where 07362 /// possible. 07363 static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, 07364 const X86Subtarget *Subtarget, 07365 SelectionDAG &DAG) { 07366 SDLoc DL(Op); 07367 assert(Op.getSimpleValueType() == MVT::v2f64 && "Bad shuffle type!"); 07368 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!"); 07369 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!"); 07370 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 07371 ArrayRef<int> Mask = SVOp->getMask(); 07372 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); 07373 07374 if (isSingleInputShuffleMask(Mask)) { 07375 // Straight shuffle of a single input vector. Simulate this by using the 07376 // single input as both of the "inputs" to this instruction.. 07377 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1); 07378 return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V1, 07379 DAG.getConstant(SHUFPDMask, MVT::i8)); 07380 } 07381 assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!"); 07382 assert(Mask[1] >= 2 && "Non-canonicalized blend!"); 07383 07384 // Use dedicated unpack instructions for masks that match their pattern. 
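// --- Editorial sketch (not part of the LLVM source; the helper name is
// illustrative). The two checks below recognize the v2f64 interleave patterns
// directly:
//   mask <0, 2> -> UNPCKLPD: result = { V1[0], V2[0] }
//   mask <1, 3> -> UNPCKHPD: result = { V1[1], V2[1] }
// For the general two-input case the code falls back to SHUFPD, whose
// immediate uses bit 0 to pick the V1 lane and bit 1 to pick the V2 lane.
// Standalone model of that immediate:
static unsigned modelShufPDImm(int Mask0, int Mask1) {
  // Mask0 is 0 or 1 (a V1 lane); Mask1 is 2 or 3 (a V2 lane).
  return (unsigned)(Mask0 == 1) | ((unsigned)(Mask1 == 3) << 1);
}
// --- End of editorial sketch; the original listing continues below. --------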
07385 if (isShuffleEquivalent(Mask, 0, 2)) 07386 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2); 07387 if (isShuffleEquivalent(Mask, 1, 3)) 07388 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2); 07389 07390 if (Subtarget->hasSSE41()) 07391 if (SDValue Blend = 07392 lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask, DAG)) 07393 return Blend; 07394 07395 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1); 07396 return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2, 07397 DAG.getConstant(SHUFPDMask, MVT::i8)); 07398 } 07399 07400 /// \brief Handle lowering of 2-lane 64-bit integer shuffles. 07401 /// 07402 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by 07403 /// the integer unit to minimize domain crossing penalties. However, for blends 07404 /// it falls back to the floating point shuffle operation with appropriate bit 07405 /// casting. 07406 static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, 07407 const X86Subtarget *Subtarget, 07408 SelectionDAG &DAG) { 07409 SDLoc DL(Op); 07410 assert(Op.getSimpleValueType() == MVT::v2i64 && "Bad shuffle type!"); 07411 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"); 07412 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"); 07413 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 07414 ArrayRef<int> Mask = SVOp->getMask(); 07415 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); 07416 07417 if (isSingleInputShuffleMask(Mask)) { 07418 // Straight shuffle of a single input vector. For everything from SSE2 07419 // onward this has a single fast instruction with no scary immediates. 07420 // We have to map the mask as it is actually a v4i32 shuffle instruction. 07421 V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V1); 07422 int WidenedMask[4] = { 07423 std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1, 07424 std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1}; 07425 return DAG.getNode( 07426 ISD::BITCAST, DL, MVT::v2i64, 07427 DAG.getNode(X86ISD::PSHUFD, SDLoc(Op), MVT::v4i32, V1, 07428 getV4X86ShuffleImm8ForMask(WidenedMask, DAG))); 07429 } 07430 07431 // Use dedicated unpack instructions for masks that match their pattern. 07432 if (isShuffleEquivalent(Mask, 0, 2)) 07433 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2); 07434 if (isShuffleEquivalent(Mask, 1, 3)) 07435 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2); 07436 07437 if (Subtarget->hasSSE41()) 07438 if (SDValue Blend = 07439 lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask, DAG)) 07440 return Blend; 07441 07442 // We implement this with SHUFPD which is pretty lame because it will likely 07443 // incur 2 cycles of stall for integer vectors on Nehalem and older chips. 07444 // However, all the alternatives are still more cycles and newer chips don't 07445 // have this problem. It would be really nice if x86 had better shuffles here. 07446 V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V1); 07447 V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V2); 07448 return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, 07449 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask)); 07450 } 07451 07452 /// \brief Lower 4-lane 32-bit floating point shuffles. 07453 /// 07454 /// Uses instructions exclusively from the floating point unit to minimize 07455 /// domain crossing penalties, as these are sufficient to implement all v4f32 07456 /// shuffles. 
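Before the per-type lowering routines that follow, a brief aside on the immediates they build: the 4-lane shuffle immediates consumed by SHUFPS/PSHUFD-style nodes pack one 2-bit source-lane selector per result lane, which, as far as this encoding goes, is what the getV4X86ShuffleImm8ForMask helper used throughout these routines computes. The sketch below is illustrative only and not part of this file; sketchV4ShuffleImm8 is a made-up name.

// Illustrative sketch: encode a 4-lane shuffle mask into the 8-bit immediate
// form used by SHUFPS/PSHUFD, two bits per destination lane, treating undef
// (-1) lanes as lane 0.
static unsigned sketchV4ShuffleImm8(const int Mask[4]) {
  unsigned Imm = 0;
  for (int i = 0; i < 4; ++i) {
    int Lane = Mask[i] < 0 ? 0 : (Mask[i] & 3);
    Imm |= Lane << (2 * i);
  }
  return Imm; // e.g. Mask = {3, 2, 1, 0} yields 0x1B.
}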
07457 static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, 07458 const X86Subtarget *Subtarget, 07459 SelectionDAG &DAG) { 07460 SDLoc DL(Op); 07461 assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!"); 07462 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); 07463 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); 07464 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 07465 ArrayRef<int> Mask = SVOp->getMask(); 07466 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); 07467 07468 SDValue LowV = V1, HighV = V2; 07469 int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]}; 07470 07471 int NumV2Elements = 07472 std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); 07473 07474 if (NumV2Elements == 0) 07475 // Straight shuffle of a single input vector. We pass the input vector to 07476 // both operands to simulate this with a SHUFPS. 07477 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1, 07478 getV4X86ShuffleImm8ForMask(Mask, DAG)); 07479 07480 // Use dedicated unpack instructions for masks that match their pattern. 07481 if (isShuffleEquivalent(Mask, 0, 4, 1, 5)) 07482 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2); 07483 if (isShuffleEquivalent(Mask, 2, 6, 3, 7)) 07484 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2); 07485 07486 if (Subtarget->hasSSE41()) 07487 if (SDValue Blend = 07488 lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask, DAG)) 07489 return Blend; 07490 07491 if (NumV2Elements == 1) { 07492 int V2Index = 07493 std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) - 07494 Mask.begin(); 07495 07496 // Check for whether we can use INSERTPS to perform the blend. We only use 07497 // INSERTPS when the V1 elements are already in the correct locations 07498 // because otherwise we can just always use two SHUFPS instructions which 07499 // are much smaller to encode than a SHUFPS and an INSERTPS. 07500 if (Subtarget->hasSSE41()) { 07501 // When using INSERTPS we can zero any lane of the destination. Collect 07502 // the zero inputs into a mask and drop them from the lanes of V1 which 07503 // actually need to be present as inputs to the INSERTPS. 07504 unsigned ZMask = 0; 07505 if (ISD::isBuildVectorAllZeros(V1.getNode())) { 07506 ZMask = 0xF ^ (1 << V2Index); 07507 } else if (V1.getOpcode() == ISD::BUILD_VECTOR) { 07508 for (int i = 0; i < 4; ++i) { 07509 int M = Mask[i]; 07510 if (M >= 4) 07511 continue; 07512 if (M > -1) { 07513 SDValue Input = V1.getOperand(M); 07514 if (Input.getOpcode() != ISD::UNDEF && 07515 !X86::isZeroNode(Input)) { 07516 // A non-zero input! 07517 ZMask = 0; 07518 break; 07519 } 07520 } 07521 ZMask |= 1 << i; 07522 } 07523 } 07524 07525 // Synthesize a shuffle mask for the non-zero and non-v2 inputs. 07526 int InsertShuffleMask[4] = {-1, -1, -1, -1}; 07527 for (int i = 0; i < 4; ++i) 07528 if (i != V2Index && (ZMask & (1 << i)) == 0) 07529 InsertShuffleMask[i] = Mask[i]; 07530 07531 if (isNoopShuffleMask(InsertShuffleMask)) { 07532 // Replace V1 with undef if nothing from V1 survives the INSERTPS. 07533 if ((ZMask | 1 << V2Index) == 0xF) 07534 V1 = DAG.getUNDEF(MVT::v4f32); 07535 07536 unsigned InsertPSMask = (Mask[V2Index] - 4) << 6 | V2Index << 4 | ZMask; 07537 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); 07538 07539 // Insert the V2 element into the desired position. 
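  // INSERTPS immediate layout, for reference: bits [7:6] select the source
  // element of V2, bits [5:4] select the destination lane, and bits [3:0] are
  // the set of destination lanes to zero -- the three fields OR'd into
  // InsertPSMask above.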
07540 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, 07541 DAG.getConstant(InsertPSMask, MVT::i8)); 07542 } 07543 } 07544 07545 // Compute the index adjacent to V2Index and in the same half by toggling 07546 // the low bit. 07547 int V2AdjIndex = V2Index ^ 1; 07548 07549 if (Mask[V2AdjIndex] == -1) { 07550 // Handles all the cases where we have a single V2 element and an undef. 07551 // This will only ever happen in the high lanes because we commute the 07552 // vector otherwise. 07553 if (V2Index < 2) 07554 std::swap(LowV, HighV); 07555 NewMask[V2Index] -= 4; 07556 } else { 07557 // Handle the case where the V2 element ends up adjacent to a V1 element. 07558 // To make this work, blend them together as the first step. 07559 int V1Index = V2AdjIndex; 07560 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0}; 07561 V2 = DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V2, V1, 07562 getV4X86ShuffleImm8ForMask(BlendMask, DAG)); 07563 07564 // Now proceed to reconstruct the final blend as we have the necessary 07565 // high or low half formed. 07566 if (V2Index < 2) { 07567 LowV = V2; 07568 HighV = V1; 07569 } else { 07570 HighV = V2; 07571 } 07572 NewMask[V1Index] = 2; // We put the V1 element in V2[2]. 07573 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0]. 07574 } 07575 } else if (NumV2Elements == 2) { 07576 if (Mask[0] < 4 && Mask[1] < 4) { 07577 // Handle the easy case where we have V1 in the low lanes and V2 in the 07578 // high lanes. We never see this reversed because we sort the shuffle. 07579 NewMask[2] -= 4; 07580 NewMask[3] -= 4; 07581 } else { 07582 // We have a mixture of V1 and V2 in both low and high lanes. Rather than 07583 // trying to place elements directly, just blend them and set up the final 07584 // shuffle to place them. 07585 07586 // The first two blend mask elements are for V1, the second two are for 07587 // V2. 07588 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1], 07589 Mask[2] < 4 ? Mask[2] : Mask[3], 07590 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4, 07591 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4}; 07592 V1 = DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V2, 07593 getV4X86ShuffleImm8ForMask(BlendMask, DAG)); 07594 07595 // Now we do a normal shuffle of V1 by giving V1 as both operands to 07596 // a blend. 07597 LowV = HighV = V1; 07598 NewMask[0] = Mask[0] < 4 ? 0 : 2; 07599 NewMask[1] = Mask[0] < 4 ? 2 : 0; 07600 NewMask[2] = Mask[2] < 4 ? 1 : 3; 07601 NewMask[3] = Mask[2] < 4 ? 3 : 1; 07602 } 07603 } 07604 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, LowV, HighV, 07605 getV4X86ShuffleImm8ForMask(NewMask, DAG)); 07606 } 07607 07608 static SDValue lowerIntegerElementInsertionVectorShuffle( 07609 MVT VT, SDLoc DL, SDValue V1, SDValue V2, ArrayRef<int> Mask, 07610 const X86Subtarget *Subtarget, SelectionDAG &DAG) { 07611 int V2Index = std::find_if(Mask.begin(), Mask.end(), 07612 [&Mask](int M) { return M >= (int)Mask.size(); }) - 07613 Mask.begin(); 07614 07615 // Check for a single input from a SCALAR_TO_VECTOR node. 07616 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and 07617 // all the smarts here sunk into that routine. However, the current 07618 // lowering of BUILD_VECTOR makes that nearly impossible until the old 07619 // vector shuffle lowering is dead. 
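  // The single V2 element must be recoverable as a scalar: either it is lane 0
  // of V2 and V2 is a SCALAR_TO_VECTOR, or V2 is a BUILD_VECTOR whose operand
  // can be peeled off directly.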
07620 if ((Mask[V2Index] == (int)Mask.size() && 07621 V2.getOpcode() == ISD::SCALAR_TO_VECTOR) || 07622 V2.getOpcode() == ISD::BUILD_VECTOR) { 07623 SDValue V2S = V2.getOperand(Mask[V2Index] - Mask.size()); 07624 07625 bool V1IsAllZero = false; 07626 if (ISD::isBuildVectorAllZeros(V1.getNode())) { 07627 V1IsAllZero = true; 07628 } else if (V1.getOpcode() == ISD::BUILD_VECTOR) { 07629 V1IsAllZero = true; 07630 for (int M : Mask) { 07631 if (M < 0 || M >= (int)Mask.size()) 07632 continue; 07633 SDValue Input = V1.getOperand(M); 07634 if (Input.getOpcode() != ISD::UNDEF && !X86::isZeroNode(Input)) { 07635 // A non-zero input! 07636 V1IsAllZero = false; 07637 break; 07638 } 07639 } 07640 } 07641 if (V1IsAllZero) { 07642 // First, we need to zext the scalar if it is smaller than an i32. 07643 MVT EltVT = VT.getVectorElementType(); 07644 assert(EltVT == V2S.getSimpleValueType() && 07645 "Different scalar and element types!"); 07646 MVT ExtVT = VT; 07647 if (EltVT == MVT::i8 || EltVT == MVT::i16) { 07648 // Zero-extend directly to i32. 07649 ExtVT = MVT::v4i32; 07650 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S); 07651 } 07652 07653 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, 07654 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S)); 07655 if (ExtVT != VT) 07656 V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2); 07657 07658 if (V2Index != 0) { 07659 // If we have 4 or fewer lanes we can cheaply shuffle the element into 07660 // the desired position. Otherwise it is more efficient to do a vector 07661 // shift left. We know that we can do a vector shift left because all 07662 // the inputs are zero. 07663 if (VT.getVectorNumElements() <= 4) { 07664 SmallVector<int, 4> V2Shuffle(Mask.size(), 1); 07665 V2Shuffle[V2Index] = 0; 07666 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle); 07667 } else { 07668 V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V2); 07669 V2 = DAG.getNode( 07670 X86ISD::VSHLDQ, DL, MVT::v2i64, V2, 07671 DAG.getConstant( 07672 V2Index * EltVT.getSizeInBits(), 07673 DAG.getTargetLoweringInfo().getScalarShiftAmountTy(MVT::v2i64))); 07674 V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2); 07675 } 07676 } 07677 return V2; 07678 } 07679 } 07680 return SDValue(); 07681 } 07682 07683 /// \brief Lower 4-lane i32 vector shuffles. 07684 /// 07685 /// We try to handle these with integer-domain shuffles where we can, but for 07686 /// blends we use the floating point domain blend instructions. 07687 static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, 07688 const X86Subtarget *Subtarget, 07689 SelectionDAG &DAG) { 07690 SDLoc DL(Op); 07691 assert(Op.getSimpleValueType() == MVT::v4i32 && "Bad shuffle type!"); 07692 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!"); 07693 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!"); 07694 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 07695 ArrayRef<int> Mask = SVOp->getMask(); 07696 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); 07697 07698 int NumV2Elements = 07699 std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); 07700 07701 if (NumV2Elements == 0) { 07702 // Straight shuffle of a single input vector. For everything from SSE2 07703 // onward this has a single fast instruction with no scary immediates. 
07704 // We coerce the shuffle pattern to be compatible with UNPCK instructions 07705 // but we aren't actually going to use the UNPCK instruction because doing 07706 // so prevents folding a load into this instruction or making a copy. 07707 const int UnpackLoMask[] = {0, 0, 1, 1}; 07708 const int UnpackHiMask[] = {2, 2, 3, 3}; 07709 if (isShuffleEquivalent(Mask, 0, 0, 1, 1)) 07710 Mask = UnpackLoMask; 07711 else if (isShuffleEquivalent(Mask, 2, 2, 3, 3)) 07712 Mask = UnpackHiMask; 07713 07714 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1, 07715 getV4X86ShuffleImm8ForMask(Mask, DAG)); 07716 } 07717 07718 // Use dedicated unpack instructions for masks that match their pattern. 07719 if (isShuffleEquivalent(Mask, 0, 4, 1, 5)) 07720 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2); 07721 if (isShuffleEquivalent(Mask, 2, 6, 3, 7)) 07722 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2); 07723 07724 // There are special ways we can lower some single-element blends. 07725 if (NumV2Elements == 1) 07726 if (SDValue V = lowerIntegerElementInsertionVectorShuffle( 07727 MVT::v4i32, DL, V1, V2, Mask, Subtarget, DAG)) 07728 return V; 07729 07730 if (Subtarget->hasSSE41()) 07731 if (SDValue Blend = 07732 lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask, DAG)) 07733 return Blend; 07734 07735 // We implement this with SHUFPS because it can blend from two vectors. 07736 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build 07737 // up the inputs, bypassing domain shift penalties that we would encur if we 07738 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't 07739 // relevant. 07740 return DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, 07741 DAG.getVectorShuffle( 07742 MVT::v4f32, DL, 07743 DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V1), 07744 DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V2), Mask)); 07745 } 07746 07747 /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2 07748 /// shuffle lowering, and the most complex part. 07749 /// 07750 /// The lowering strategy is to try to form pairs of input lanes which are 07751 /// targeted at the same half of the final vector, and then use a dword shuffle 07752 /// to place them onto the right half, and finally unpack the paired lanes into 07753 /// their final position. 07754 /// 07755 /// The exact breakdown of how to form these dword pairs and align them on the 07756 /// correct sides is really tricky. See the comments within the function for 07757 /// more of the details. 
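As a rough illustration of the bookkeeping the routine below starts with (the helper name is hypothetical, and it ignores the deduplication of repeated inputs that the real code performs), each of the eight mask slots is bucketed by which half of the source it reads from and which half of the result it feeds; the four bucket counts then drive the choice of strategy.

// Illustrative sketch: bucket a v8i16 shuffle mask into the four input groups
// the lowering reasons about. Counts must be zero-initialized by the caller:
// Counts[0] = L->L, Counts[1] = H->L, Counts[2] = L->H, Counts[3] = H->H.
static void sketchClassifyV8I16Mask(const int Mask[8], int Counts[4]) {
  for (int i = 0; i < 8; ++i) {
    if (Mask[i] < 0)
      continue; // Undef slots constrain nothing.
    bool SrcHi = Mask[i] >= 4; // Reads from the high half of the source.
    bool DstHi = i >= 4;       // Lands in the high half of the result.
    ++Counts[(DstHi ? 2 : 0) + (SrcHi ? 1 : 0)];
  }
}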
07758 static SDValue lowerV8I16SingleInputVectorShuffle( 07759 SDLoc DL, SDValue V, MutableArrayRef<int> Mask, 07760 const X86Subtarget *Subtarget, SelectionDAG &DAG) { 07761 assert(V.getSimpleValueType() == MVT::v8i16 && "Bad input type!"); 07762 MutableArrayRef<int> LoMask = Mask.slice(0, 4); 07763 MutableArrayRef<int> HiMask = Mask.slice(4, 4); 07764 07765 SmallVector<int, 4> LoInputs; 07766 std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs), 07767 [](int M) { return M >= 0; }); 07768 std::sort(LoInputs.begin(), LoInputs.end()); 07769 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end()); 07770 SmallVector<int, 4> HiInputs; 07771 std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs), 07772 [](int M) { return M >= 0; }); 07773 std::sort(HiInputs.begin(), HiInputs.end()); 07774 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end()); 07775 int NumLToL = 07776 std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin(); 07777 int NumHToL = LoInputs.size() - NumLToL; 07778 int NumLToH = 07779 std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin(); 07780 int NumHToH = HiInputs.size() - NumLToH; 07781 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL); 07782 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH); 07783 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL); 07784 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH); 07785 07786 // Use dedicated unpack instructions for masks that match their pattern. 07787 if (isShuffleEquivalent(Mask, 0, 0, 1, 1, 2, 2, 3, 3)) 07788 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V); 07789 if (isShuffleEquivalent(Mask, 4, 4, 5, 5, 6, 6, 7, 7)) 07790 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V, V); 07791 07792 // Try to use rotation instructions if available. 07793 if (Subtarget->hasSSSE3()) 07794 if (SDValue Rotate = lowerVectorShuffleAsByteRotate( 07795 DL, MVT::v8i16, V, V, Mask, DAG)) 07796 return Rotate; 07797 07798 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all 07799 // such inputs we can swap two of the dwords across the half mark and end up 07800 // with <=2 inputs to each half in each half. Once there, we can fall through 07801 // to the generic code below. For example: 07802 // 07803 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h] 07804 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5] 07805 // 07806 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half 07807 // and an existing 2-into-2 on the other half. In this case we may have to 07808 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or 07809 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn. 07810 // Fortunately, we don't have to handle anything but a 2-into-2 pattern 07811 // because any other situation (including a 3-into-1 or 1-into-3 in the other 07812 // half than the one we target for fixing) will be fixed when we re-enter this 07813 // path. We will also combine away any sequence of PSHUFD instructions that 07814 // result into a single instruction. Here is an example of the tricky case: 07815 // 07816 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h] 07817 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3] 07818 // 07819 // This now has a 1-into-3 in the high half! 
Instead, we do two shuffles: 07820 // 07821 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h] 07822 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6] 07823 // 07824 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h] 07825 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6] 07826 // 07827 // The result is fine to be handled by the generic logic. 07828 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs, 07829 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs, 07830 int AOffset, int BOffset) { 07831 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) && 07832 "Must call this with A having 3 or 1 inputs from the A half."); 07833 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) && 07834 "Must call this with B having 1 or 3 inputs from the B half."); 07835 assert(AToAInputs.size() + BToAInputs.size() == 4 && 07836 "Must call this with either 3:1 or 1:3 inputs (summing to 4)."); 07837 07838 // Compute the index of dword with only one word among the three inputs in 07839 // a half by taking the sum of the half with three inputs and subtracting 07840 // the sum of the actual three inputs. The difference is the remaining 07841 // slot. 07842 int ADWord, BDWord; 07843 int &TripleDWord = AToAInputs.size() == 3 ? ADWord : BDWord; 07844 int &OneInputDWord = AToAInputs.size() == 3 ? BDWord : ADWord; 07845 int TripleInputOffset = AToAInputs.size() == 3 ? AOffset : BOffset; 07846 ArrayRef<int> TripleInputs = AToAInputs.size() == 3 ? AToAInputs : BToAInputs; 07847 int OneInput = AToAInputs.size() == 3 ? BToAInputs[0] : AToAInputs[0]; 07848 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset); 07849 int TripleNonInputIdx = 07850 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0); 07851 TripleDWord = TripleNonInputIdx / 2; 07852 07853 // We use xor with one to compute the adjacent DWord to whichever one the 07854 // OneInput is in. 07855 OneInputDWord = (OneInput / 2) ^ 1; 07856 07857 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA 07858 // and BToA inputs. If there is also such a problem with the BToB and AToB 07859 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in 07860 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it 07861 // is essential that we don't *create* a 3<-1 as then we might oscillate. 07862 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) { 07863 // Compute how many inputs will be flipped by swapping these DWords. We 07864 // need 07865 // to balance this to ensure we don't form a 3-1 shuffle in the other 07866 // half. 07867 int NumFlippedAToBInputs = 07868 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) + 07869 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1); 07870 int NumFlippedBToBInputs = 07871 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) + 07872 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1); 07873 if ((NumFlippedAToBInputs == 1 && 07874 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) || 07875 (NumFlippedBToBInputs == 1 && 07876 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) { 07877 // We choose whether to fix the A half or B half based on whether that 07878 // half has zero flipped inputs. At zero, we may not be able to fix it 07879 // with that half. 
We also bias towards fixing the B half because that 07880 // will more commonly be the high half, and we have to bias one way. 07881 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord, 07882 ArrayRef<int> Inputs) { 07883 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot. 07884 bool IsFixIdxInput = std::find(Inputs.begin(), Inputs.end(), 07885 PinnedIdx ^ 1) != Inputs.end(); 07886 // Determine whether the free index is in the flipped dword or the 07887 // unflipped dword based on where the pinned index is. We use this bit 07888 // in an xor to conditionally select the adjacent dword. 07889 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord)); 07890 bool IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(), 07891 FixFreeIdx) != Inputs.end(); 07892 if (IsFixIdxInput == IsFixFreeIdxInput) 07893 FixFreeIdx += 1; 07894 IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(), 07895 FixFreeIdx) != Inputs.end(); 07896 assert(IsFixIdxInput != IsFixFreeIdxInput && 07897 "We need to be changing the number of flipped inputs!"); 07898 int PSHUFHalfMask[] = {0, 1, 2, 3}; 07899 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]); 07900 V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL, 07901 MVT::v8i16, V, 07902 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DAG)); 07903 07904 for (int &M : Mask) 07905 if (M != -1 && M == FixIdx) 07906 M = FixFreeIdx; 07907 else if (M != -1 && M == FixFreeIdx) 07908 M = FixIdx; 07909 }; 07910 if (NumFlippedBToBInputs != 0) { 07911 int BPinnedIdx = 07912 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput; 07913 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs); 07914 } else { 07915 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!"); 07916 int APinnedIdx = 07917 AToAInputs.size() == 3 ? TripleNonInputIdx : OneInput; 07918 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs); 07919 } 07920 } 07921 } 07922 07923 int PSHUFDMask[] = {0, 1, 2, 3}; 07924 PSHUFDMask[ADWord] = BDWord; 07925 PSHUFDMask[BDWord] = ADWord; 07926 V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, 07927 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, 07928 DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V), 07929 getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG))); 07930 07931 // Adjust the mask to match the new locations of A and B. 07932 for (int &M : Mask) 07933 if (M != -1 && M/2 == ADWord) 07934 M = 2 * BDWord + M % 2; 07935 else if (M != -1 && M/2 == BDWord) 07936 M = 2 * ADWord + M % 2; 07937 07938 // Recurse back into this routine to re-compute state now that this isn't 07939 // a 3 and 1 problem. 07940 return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16), 07941 Mask); 07942 }; 07943 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3)) 07944 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4); 07945 else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3)) 07946 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0); 07947 07948 // At this point there are at most two inputs to the low and high halves from 07949 // each half. That means the inputs can always be grouped into dwords and 07950 // those dwords can then be moved to the correct half with a dword shuffle. 07951 // We use at most one low and one high word shuffle to collect these paired 07952 // inputs into dwords, and finally a dword shuffle to place them. 
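  // The three masks below correspond to that pipeline: PSHUFLMask drives a
  // PSHUFLW that only rearranges words 0-3, PSHUFHMask drives a PSHUFHW that
  // only rearranges words 4-7, and PSHUFDMask drives the final PSHUFD that
  // moves whole dwords (word pairs) across the halves.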
07953 int PSHUFLMask[4] = {-1, -1, -1, -1}; 07954 int PSHUFHMask[4] = {-1, -1, -1, -1}; 07955 int PSHUFDMask[4] = {-1, -1, -1, -1}; 07956 07957 // First fix the masks for all the inputs that are staying in their 07958 // original halves. This will then dictate the targets of the cross-half 07959 // shuffles. 07960 auto fixInPlaceInputs = 07961 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs, 07962 MutableArrayRef<int> SourceHalfMask, 07963 MutableArrayRef<int> HalfMask, int HalfOffset) { 07964 if (InPlaceInputs.empty()) 07965 return; 07966 if (InPlaceInputs.size() == 1) { 07967 SourceHalfMask[InPlaceInputs[0] - HalfOffset] = 07968 InPlaceInputs[0] - HalfOffset; 07969 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2; 07970 return; 07971 } 07972 if (IncomingInputs.empty()) { 07973 // Just fix all of the in place inputs. 07974 for (int Input : InPlaceInputs) { 07975 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset; 07976 PSHUFDMask[Input / 2] = Input / 2; 07977 } 07978 return; 07979 } 07980 07981 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!"); 07982 SourceHalfMask[InPlaceInputs[0] - HalfOffset] = 07983 InPlaceInputs[0] - HalfOffset; 07984 // Put the second input next to the first so that they are packed into 07985 // a dword. We find the adjacent index by toggling the low bit. 07986 int AdjIndex = InPlaceInputs[0] ^ 1; 07987 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset; 07988 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex); 07989 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2; 07990 }; 07991 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0); 07992 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4); 07993 07994 // Now gather the cross-half inputs and place them into a free dword of 07995 // their target half. 07996 // FIXME: This operation could almost certainly be simplified dramatically to 07997 // look more like the 3-1 fixing operation. 07998 auto moveInputsToRightHalf = [&PSHUFDMask]( 07999 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs, 08000 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask, 08001 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset, 08002 int DestOffset) { 08003 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) { 08004 return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word; 08005 }; 08006 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask, 08007 int Word) { 08008 int LowWord = Word & ~1; 08009 int HighWord = Word | 1; 08010 return isWordClobbered(SourceHalfMask, LowWord) || 08011 isWordClobbered(SourceHalfMask, HighWord); 08012 }; 08013 08014 if (IncomingInputs.empty()) 08015 return; 08016 08017 if (ExistingInputs.empty()) { 08018 // Map any dwords with inputs from them into the right half. 08019 for (int Input : IncomingInputs) { 08020 // If the source half mask maps over the inputs, turn those into 08021 // swaps and use the swapped lane. 08022 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) { 08023 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == -1) { 08024 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] = 08025 Input - SourceOffset; 08026 // We have to swap the uses in our half mask in one sweep. 
08027 for (int &M : HalfMask) 08028 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset) 08029 M = Input; 08030 else if (M == Input) 08031 M = SourceHalfMask[Input - SourceOffset] + SourceOffset; 08032 } else { 08033 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == 08034 Input - SourceOffset && 08035 "Previous placement doesn't match!"); 08036 } 08037 // Note that this correctly re-maps both when we do a swap and when 08038 // we observe the other side of the swap above. We rely on that to 08039 // avoid swapping the members of the input list directly. 08040 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset; 08041 } 08042 08043 // Map the input's dword into the correct half. 08044 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == -1) 08045 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2; 08046 else 08047 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == 08048 Input / 2 && 08049 "Previous placement doesn't match!"); 08050 } 08051 08052 // And just directly shift any other-half mask elements to be same-half 08053 // as we will have mirrored the dword containing the element into the 08054 // same position within that half. 08055 for (int &M : HalfMask) 08056 if (M >= SourceOffset && M < SourceOffset + 4) { 08057 M = M - SourceOffset + DestOffset; 08058 assert(M >= 0 && "This should never wrap below zero!"); 08059 } 08060 return; 08061 } 08062 08063 // Ensure we have the input in a viable dword of its current half. This 08064 // is particularly tricky because the original position may be clobbered 08065 // by inputs being moved and *staying* in that half. 08066 if (IncomingInputs.size() == 1) { 08067 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) { 08068 int InputFixed = std::find(std::begin(SourceHalfMask), 08069 std::end(SourceHalfMask), -1) - 08070 std::begin(SourceHalfMask) + SourceOffset; 08071 SourceHalfMask[InputFixed - SourceOffset] = 08072 IncomingInputs[0] - SourceOffset; 08073 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0], 08074 InputFixed); 08075 IncomingInputs[0] = InputFixed; 08076 } 08077 } else if (IncomingInputs.size() == 2) { 08078 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 || 08079 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) { 08080 // We have two non-adjacent or clobbered inputs we need to extract from 08081 // the source half. To do this, we need to map them into some adjacent 08082 // dword slot in the source mask. 08083 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset, 08084 IncomingInputs[1] - SourceOffset}; 08085 08086 // If there is a free slot in the source half mask adjacent to one of 08087 // the inputs, place the other input in it. We use (Index XOR 1) to 08088 // compute an adjacent index. 
08089 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) && 08090 SourceHalfMask[InputsFixed[0] ^ 1] == -1) { 08091 SourceHalfMask[InputsFixed[0]] = InputsFixed[0]; 08092 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1]; 08093 InputsFixed[1] = InputsFixed[0] ^ 1; 08094 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) && 08095 SourceHalfMask[InputsFixed[1] ^ 1] == -1) { 08096 SourceHalfMask[InputsFixed[1]] = InputsFixed[1]; 08097 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0]; 08098 InputsFixed[0] = InputsFixed[1] ^ 1; 08099 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] == -1 && 08100 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] == -1) { 08101 // The two inputs are in the same DWord but it is clobbered and the 08102 // adjacent DWord isn't used at all. Move both inputs to the free 08103 // slot. 08104 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0]; 08105 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1]; 08106 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1); 08107 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1; 08108 } else { 08109 // The only way we hit this point is if there is no clobbering 08110 // (because there are no off-half inputs to this half) and there is no 08111 // free slot adjacent to one of the inputs. In this case, we have to 08112 // swap an input with a non-input. 08113 for (int i = 0; i < 4; ++i) 08114 assert((SourceHalfMask[i] == -1 || SourceHalfMask[i] == i) && 08115 "We can't handle any clobbers here!"); 08116 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) && 08117 "Cannot have adjacent inputs here!"); 08118 08119 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1]; 08120 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1; 08121 08122 // We also have to update the final source mask in this case because 08123 // it may need to undo the above swap. 08124 for (int &M : FinalSourceHalfMask) 08125 if (M == (InputsFixed[0] ^ 1) + SourceOffset) 08126 M = InputsFixed[1] + SourceOffset; 08127 else if (M == InputsFixed[1] + SourceOffset) 08128 M = (InputsFixed[0] ^ 1) + SourceOffset; 08129 08130 InputsFixed[1] = InputsFixed[0] ^ 1; 08131 } 08132 08133 // Point everything at the fixed inputs. 08134 for (int &M : HalfMask) 08135 if (M == IncomingInputs[0]) 08136 M = InputsFixed[0] + SourceOffset; 08137 else if (M == IncomingInputs[1]) 08138 M = InputsFixed[1] + SourceOffset; 08139 08140 IncomingInputs[0] = InputsFixed[0] + SourceOffset; 08141 IncomingInputs[1] = InputsFixed[1] + SourceOffset; 08142 } 08143 } else { 08144 llvm_unreachable("Unhandled input size!"); 08145 } 08146 08147 // Now hoist the DWord down to the right half. 08148 int FreeDWord = (PSHUFDMask[DestOffset / 2] == -1 ? 0 : 1) + DestOffset / 2; 08149 assert(PSHUFDMask[FreeDWord] == -1 && "DWord not free"); 08150 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2; 08151 for (int &M : HalfMask) 08152 for (int Input : IncomingInputs) 08153 if (M == Input) 08154 M = FreeDWord * 2 + Input % 2; 08155 }; 08156 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask, 08157 /*SourceOffset*/ 4, /*DestOffset*/ 0); 08158 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask, 08159 /*SourceOffset*/ 0, /*DestOffset*/ 4); 08160 08161 // Now enact all the shuffles we've computed to move the inputs into their 08162 // target half. 
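  // Order matters here: the word shuffles (PSHUFLW/PSHUFHW) first pair inputs
  // up within their current half, and only then does the PSHUFD move the
  // resulting dwords into the half they are destined for.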
08163 if (!isNoopShuffleMask(PSHUFLMask)) 08164 V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V, 08165 getV4X86ShuffleImm8ForMask(PSHUFLMask, DAG)); 08166 if (!isNoopShuffleMask(PSHUFHMask)) 08167 V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V, 08168 getV4X86ShuffleImm8ForMask(PSHUFHMask, DAG)); 08169 if (!isNoopShuffleMask(PSHUFDMask)) 08170 V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, 08171 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, 08172 DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V), 08173 getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG))); 08174 08175 // At this point, each half should contain all its inputs, and we can then 08176 // just shuffle them into their final position. 08177 assert(std::count_if(LoMask.begin(), LoMask.end(), 08178 [](int M) { return M >= 4; }) == 0 && 08179 "Failed to lift all the high half inputs to the low mask!"); 08180 assert(std::count_if(HiMask.begin(), HiMask.end(), 08181 [](int M) { return M >= 0 && M < 4; }) == 0 && 08182 "Failed to lift all the low half inputs to the high mask!"); 08183 08184 // Do a half shuffle for the low mask. 08185 if (!isNoopShuffleMask(LoMask)) 08186 V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V, 08187 getV4X86ShuffleImm8ForMask(LoMask, DAG)); 08188 08189 // Do a half shuffle with the high mask after shifting its values down. 08190 for (int &M : HiMask) 08191 if (M >= 0) 08192 M -= 4; 08193 if (!isNoopShuffleMask(HiMask)) 08194 V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V, 08195 getV4X86ShuffleImm8ForMask(HiMask, DAG)); 08196 08197 return V; 08198 } 08199 08200 /// \brief Detect whether the mask pattern should be lowered through 08201 /// interleaving. 08202 /// 08203 /// This essentially tests whether viewing the mask as an interleaving of two 08204 /// sub-sequences reduces the cross-input traffic of a blend operation. If so, 08205 /// lowering it through interleaving is a significantly better strategy. 08206 static bool shouldLowerAsInterleaving(ArrayRef<int> Mask) { 08207 int NumEvenInputs[2] = {0, 0}; 08208 int NumOddInputs[2] = {0, 0}; 08209 int NumLoInputs[2] = {0, 0}; 08210 int NumHiInputs[2] = {0, 0}; 08211 for (int i = 0, Size = Mask.size(); i < Size; ++i) { 08212 if (Mask[i] < 0) 08213 continue; 08214 08215 int InputIdx = Mask[i] >= Size; 08216 08217 if (i < Size / 2) 08218 ++NumLoInputs[InputIdx]; 08219 else 08220 ++NumHiInputs[InputIdx]; 08221 08222 if ((i % 2) == 0) 08223 ++NumEvenInputs[InputIdx]; 08224 else 08225 ++NumOddInputs[InputIdx]; 08226 } 08227 08228 // The minimum number of cross-input results for both the interleaved and 08229 // split cases. If interleaving results in fewer cross-input results, return 08230 // true. 08231 int InterleavedCrosses = std::min(NumEvenInputs[1] + NumOddInputs[0], 08232 NumEvenInputs[0] + NumOddInputs[1]); 08233 int SplitCrosses = std::min(NumLoInputs[1] + NumHiInputs[0], 08234 NumLoInputs[0] + NumHiInputs[1]); 08235 return InterleavedCrosses < SplitCrosses; 08236 } 08237 08238 /// \brief Blend two v8i16 vectors using a naive unpack strategy. 08239 /// 08240 /// This strategy only works when the inputs from each vector fit into a single 08241 /// half of that vector, and generally there are not so many inputs as to leave 08242 /// the in-place shuffles required highly constrained (and thus expensive). It 08243 /// shifts all the inputs into a single side of both input vectors and then 08244 /// uses an unpack to interleave these inputs in a single vector. At that 08245 /// point, we will fall back on the generic single input shuffle lowering. 
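A small worked example of the merge step described above (illustrative only): once both inputs have their relevant elements packed into the same half, the unpack interleaves word i of V1 into result lane 2*i and word i of V2 into lane 2*i+1, which is what the remapping M = 2 * (M % 4) + (M / 8) at the end of the routine encodes; for instance, mask value 9 (word 1 of V2) ends up in lane 2*1 + 1 = 3 of the merged vector.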
08246 static SDValue lowerV8I16BasicBlendVectorShuffle(SDLoc DL, SDValue V1, 08247 SDValue V2, 08248 MutableArrayRef<int> Mask, 08249 const X86Subtarget *Subtarget, 08250 SelectionDAG &DAG) { 08251 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad input type!"); 08252 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad input type!"); 08253 SmallVector<int, 3> LoV1Inputs, HiV1Inputs, LoV2Inputs, HiV2Inputs; 08254 for (int i = 0; i < 8; ++i) 08255 if (Mask[i] >= 0 && Mask[i] < 4) 08256 LoV1Inputs.push_back(i); 08257 else if (Mask[i] >= 4 && Mask[i] < 8) 08258 HiV1Inputs.push_back(i); 08259 else if (Mask[i] >= 8 && Mask[i] < 12) 08260 LoV2Inputs.push_back(i); 08261 else if (Mask[i] >= 12) 08262 HiV2Inputs.push_back(i); 08263 08264 int NumV1Inputs = LoV1Inputs.size() + HiV1Inputs.size(); 08265 int NumV2Inputs = LoV2Inputs.size() + HiV2Inputs.size(); 08266 (void)NumV1Inputs; 08267 (void)NumV2Inputs; 08268 assert(NumV1Inputs > 0 && NumV1Inputs <= 3 && "At most 3 inputs supported"); 08269 assert(NumV2Inputs > 0 && NumV2Inputs <= 3 && "At most 3 inputs supported"); 08270 assert(NumV1Inputs + NumV2Inputs <= 4 && "At most 4 combined inputs"); 08271 08272 bool MergeFromLo = LoV1Inputs.size() + LoV2Inputs.size() >= 08273 HiV1Inputs.size() + HiV2Inputs.size(); 08274 08275 auto moveInputsToHalf = [&](SDValue V, ArrayRef<int> LoInputs, 08276 ArrayRef<int> HiInputs, bool MoveToLo, 08277 int MaskOffset) { 08278 ArrayRef<int> GoodInputs = MoveToLo ? LoInputs : HiInputs; 08279 ArrayRef<int> BadInputs = MoveToLo ? HiInputs : LoInputs; 08280 if (BadInputs.empty()) 08281 return V; 08282 08283 int MoveMask[] = {-1, -1, -1, -1, -1, -1, -1, -1}; 08284 int MoveOffset = MoveToLo ? 0 : 4; 08285 08286 if (GoodInputs.empty()) { 08287 for (int BadInput : BadInputs) { 08288 MoveMask[Mask[BadInput] % 4 + MoveOffset] = Mask[BadInput] - MaskOffset; 08289 Mask[BadInput] = Mask[BadInput] % 4 + MoveOffset + MaskOffset; 08290 } 08291 } else { 08292 if (GoodInputs.size() == 2) { 08293 // If the low inputs are spread across two dwords, pack them into 08294 // a single dword. 08295 MoveMask[MoveOffset] = Mask[GoodInputs[0]] - MaskOffset; 08296 MoveMask[MoveOffset + 1] = Mask[GoodInputs[1]] - MaskOffset; 08297 Mask[GoodInputs[0]] = MoveOffset + MaskOffset; 08298 Mask[GoodInputs[1]] = MoveOffset + 1 + MaskOffset; 08299 } else { 08300 // Otherwise pin the good inputs. 08301 for (int GoodInput : GoodInputs) 08302 MoveMask[Mask[GoodInput] - MaskOffset] = Mask[GoodInput] - MaskOffset; 08303 } 08304 08305 if (BadInputs.size() == 2) { 08306 // If we have two bad inputs then there may be either one or two good 08307 // inputs fixed in place. Find a fixed input, and then find the *other* 08308 // two adjacent indices by using modular arithmetic. 
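  // Concretely, the arithmetic below takes the good input's position relative
  // to the target half, rounds it down to the start of its dword (& ~1), and
  // steps to the other dword of that half ((+ 2) % 4); both word slots of that
  // dword are then free to receive the two bad inputs.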
08309 int GoodMaskIdx = 08310 std::find_if(std::begin(MoveMask) + MoveOffset, std::end(MoveMask), 08311 [](int M) { return M >= 0; }) - 08312 std::begin(MoveMask); 08313 int MoveMaskIdx = 08314 ((((GoodMaskIdx - MoveOffset) & ~1) + 2) % 4) + MoveOffset; 08315 assert(MoveMask[MoveMaskIdx] == -1 && "Expected empty slot"); 08316 assert(MoveMask[MoveMaskIdx + 1] == -1 && "Expected empty slot"); 08317 MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset; 08318 MoveMask[MoveMaskIdx + 1] = Mask[BadInputs[1]] - MaskOffset; 08319 Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset; 08320 Mask[BadInputs[1]] = MoveMaskIdx + 1 + MaskOffset; 08321 } else { 08322 assert(BadInputs.size() == 1 && "All sizes handled"); 08323 int MoveMaskIdx = std::find(std::begin(MoveMask) + MoveOffset, 08324 std::end(MoveMask), -1) - 08325 std::begin(MoveMask); 08326 MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset; 08327 Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset; 08328 } 08329 } 08330 08331 return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16), 08332 MoveMask); 08333 }; 08334 V1 = moveInputsToHalf(V1, LoV1Inputs, HiV1Inputs, MergeFromLo, 08335 /*MaskOffset*/ 0); 08336 V2 = moveInputsToHalf(V2, LoV2Inputs, HiV2Inputs, MergeFromLo, 08337 /*MaskOffset*/ 8); 08338 08339 // FIXME: Select an interleaving of the merge of V1 and V2 that minimizes 08340 // cross-half traffic in the final shuffle. 08341 08342 // Munge the mask to be a single-input mask after the unpack merges the 08343 // results. 08344 for (int &M : Mask) 08345 if (M != -1) 08346 M = 2 * (M % 4) + (M / 8); 08347 08348 return DAG.getVectorShuffle( 08349 MVT::v8i16, DL, DAG.getNode(MergeFromLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, 08350 DL, MVT::v8i16, V1, V2), 08351 DAG.getUNDEF(MVT::v8i16), Mask); 08352 } 08353 08354 /// \brief Generic lowering of 8-lane i16 shuffles. 08355 /// 08356 /// This handles both single-input shuffles and combined shuffle/blends with 08357 /// two inputs. The single input shuffles are immediately delegated to 08358 /// a dedicated lowering routine. 08359 /// 08360 /// The blends are lowered in one of three fundamental ways. If there are few 08361 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle 08362 /// of the input is significantly cheaper when lowered as an interleaving of 08363 /// the two inputs, try to interleave them. Otherwise, blend the low and high 08364 /// halves of the inputs separately (making them have relatively few inputs) 08365 /// and then concatenate them. 
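For intuition on the interleaving heuristic (an illustrative example, not taken from the source): a mask such as [0, 8, 1, 9, 2, 10, 3, 11] draws all of its even result lanes from V1 and all of its odd lanes from V2, so viewing it as two interleaved sub-sequences incurs no cross-input traffic, whereas splitting it into low and high halves mixes both inputs in each half; shouldLowerAsInterleaving prefers the interleaved view in exactly such cases.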
08366 static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, 08367 const X86Subtarget *Subtarget, 08368 SelectionDAG &DAG) { 08369 SDLoc DL(Op); 08370 assert(Op.getSimpleValueType() == MVT::v8i16 && "Bad shuffle type!"); 08371 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!"); 08372 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!"); 08373 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 08374 ArrayRef<int> OrigMask = SVOp->getMask(); 08375 int MaskStorage[8] = {OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3], 08376 OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7]}; 08377 MutableArrayRef<int> Mask(MaskStorage); 08378 08379 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); 08380 08381 auto isV1 = [](int M) { return M >= 0 && M < 8; }; 08382 auto isV2 = [](int M) { return M >= 8; }; 08383 08384 int NumV1Inputs = std::count_if(Mask.begin(), Mask.end(), isV1); 08385 int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2); 08386 08387 if (NumV2Inputs == 0) 08388 return lowerV8I16SingleInputVectorShuffle(DL, V1, Mask, Subtarget, DAG); 08389 08390 assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized " 08391 "to be V1-input shuffles."); 08392 08393 // There are special ways we can lower some single-element blends. 08394 if (NumV2Inputs == 1) 08395 if (SDValue V = lowerIntegerElementInsertionVectorShuffle( 08396 MVT::v8i16, DL, V1, V2, Mask, Subtarget, DAG)) 08397 return V; 08398 08399 if (Subtarget->hasSSE41()) 08400 if (SDValue Blend = 08401 lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask, DAG)) 08402 return Blend; 08403 08404 // Try to use rotation instructions if available. 08405 if (Subtarget->hasSSSE3()) 08406 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask, DAG)) 08407 return Rotate; 08408 08409 if (NumV1Inputs + NumV2Inputs <= 4) 08410 return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG); 08411 08412 // Check whether an interleaving lowering is likely to be more efficient. 08413 // This isn't perfect but it is a strong heuristic that tends to work well on 08414 // the kinds of shuffles that show up in practice. 08415 // 08416 // FIXME: Handle 1x, 2x, and 4x interleaving. 08417 if (shouldLowerAsInterleaving(Mask)) { 08418 // FIXME: Figure out whether we should pack these into the low or high 08419 // halves. 
08420 08421 int EMask[8], OMask[8]; 08422 for (int i = 0; i < 4; ++i) { 08423 EMask[i] = Mask[2*i]; 08424 OMask[i] = Mask[2*i + 1]; 08425 EMask[i + 4] = -1; 08426 OMask[i + 4] = -1; 08427 } 08428 08429 SDValue Evens = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, EMask); 08430 SDValue Odds = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, OMask); 08431 08432 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, Evens, Odds); 08433 } 08434 08435 int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; 08436 int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; 08437 08438 for (int i = 0; i < 4; ++i) { 08439 LoBlendMask[i] = Mask[i]; 08440 HiBlendMask[i] = Mask[i + 4]; 08441 } 08442 08443 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask); 08444 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask); 08445 LoV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, LoV); 08446 HiV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, HiV); 08447 08448 return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, 08449 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, LoV, HiV)); 08450 } 08451 08452 /// \brief Check whether a compaction lowering can be done by dropping even 08453 /// elements and compute how many times even elements must be dropped. 08454 /// 08455 /// This handles shuffles which take every Nth element where N is a power of 08456 /// two. Example shuffle masks: 08457 /// 08458 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14 08459 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 08460 /// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12 08461 /// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28 08462 /// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8 08463 /// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24 08464 /// 08465 /// Any of these lanes can of course be undef. 08466 /// 08467 /// This routine only supports N <= 3. 08468 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here 08469 /// for larger N. 08470 /// 08471 /// \returns N above, or the number of times even elements must be dropped if 08472 /// there is such a number. Otherwise returns zero. 08473 static int canLowerByDroppingEvenElements(ArrayRef<int> Mask) { 08474 // Figure out whether we're looping over two inputs or just one. 08475 bool IsSingleInput = isSingleInputShuffleMask(Mask); 08476 08477 // The modulus for the shuffle vector entries is based on whether this is 08478 // a single input or not. 08479 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2); 08480 assert(isPowerOf2_32((uint32_t)ShuffleModulus) && 08481 "We should only be called with masks with a power-of-2 size!"); 08482 08483 uint64_t ModMask = (uint64_t)ShuffleModulus - 1; 08484 08485 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2, 08486 // and 2^3 simultaneously. This is because we may have ambiguity with 08487 // partially undef inputs. 08488 bool ViableForN[3] = {true, true, true}; 08489 08490 for (int i = 0, e = Mask.size(); i < e; ++i) { 08491 // Ignore undef lanes, we'll optimistically collapse them to the pattern we 08492 // want. 08493 if (Mask[i] == -1) 08494 continue; 08495 08496 bool IsAnyViable = false; 08497 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j) 08498 if (ViableForN[j]) { 08499 uint64_t N = j + 1; 08500 08501 // The shuffle mask must be equal to (i * 2^N) % M. 
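  // For example, with N = 1 and a single input (ModMask = 15), slot i must
  // hold element (i * 2) % 16, i.e. 0, 2, 4, ..., 14, 0, 2, ..., 14 -- the
  // first pattern listed in the documentation comment above.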
08502 if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask)) 08503 IsAnyViable = true; 08504 else 08505 ViableForN[j] = false; 08506 } 08507 // Early exit if we exhaust the possible powers of two. 08508 if (!IsAnyViable) 08509 break; 08510 } 08511 08512 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j) 08513 if (ViableForN[j]) 08514 return j + 1; 08515 08516 // Return 0 as there is no viable power of two. 08517 return 0; 08518 } 08519 08520 /// \brief Generic lowering of v16i8 shuffles. 08521 /// 08522 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to 08523 /// detect any complexity reducing interleaving. If that doesn't help, it uses 08524 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses 08525 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them 08526 /// back together. 08527 static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, 08528 const X86Subtarget *Subtarget, 08529 SelectionDAG &DAG) { 08530 SDLoc DL(Op); 08531 assert(Op.getSimpleValueType() == MVT::v16i8 && "Bad shuffle type!"); 08532 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"); 08533 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"); 08534 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 08535 ArrayRef<int> OrigMask = SVOp->getMask(); 08536 assert(OrigMask.size() == 16 && "Unexpected mask size for v16 shuffle!"); 08537 08538 // Try to use rotation instructions if available. 08539 if (Subtarget->hasSSSE3()) 08540 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, 08541 OrigMask, DAG)) 08542 return Rotate; 08543 08544 int MaskStorage[16] = { 08545 OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3], 08546 OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7], 08547 OrigMask[8], OrigMask[9], OrigMask[10], OrigMask[11], 08548 OrigMask[12], OrigMask[13], OrigMask[14], OrigMask[15]}; 08549 MutableArrayRef<int> Mask(MaskStorage); 08550 MutableArrayRef<int> LoMask = Mask.slice(0, 8); 08551 MutableArrayRef<int> HiMask = Mask.slice(8, 8); 08552 08553 int NumV2Elements = 08554 std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; }); 08555 08556 // For single-input shuffles, there are some nicer lowering tricks we can use. 08557 if (NumV2Elements == 0) { 08558 // Check whether we can widen this to an i16 shuffle by duplicating bytes. 08559 // Notably, this handles splat and partial-splat shuffles more efficiently. 08560 // However, it only makes sense if the pre-duplication shuffle simplifies 08561 // things significantly. Currently, this means we need to be able to 08562 // express the pre-duplication shuffle as an i16 shuffle. 08563 // 08564 // FIXME: We should check for other patterns which can be widened into an 08565 // i16 shuffle as well. 
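  // canWidenViaDuplication below requires every adjacent pair of byte lanes to
  // read the same source byte (e.g. <5, 5, 2, 2, ...>); such a mask is really
  // an i16 shuffle of a vector in which each byte has been duplicated.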
08566 auto canWidenViaDuplication = [](ArrayRef<int> Mask) { 08567 for (int i = 0; i < 16; i += 2) { 08568 if (Mask[i] != Mask[i + 1]) 08569 return false; 08570 } 08571 return true; 08572 }; 08573 auto tryToWidenViaDuplication = [&]() -> SDValue { 08574 if (!canWidenViaDuplication(Mask)) 08575 return SDValue(); 08576 SmallVector<int, 4> LoInputs; 08577 std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs), 08578 [](int M) { return M >= 0 && M < 8; }); 08579 std::sort(LoInputs.begin(), LoInputs.end()); 08580 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), 08581 LoInputs.end()); 08582 SmallVector<int, 4> HiInputs; 08583 std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs), 08584 [](int M) { return M >= 8; }); 08585 std::sort(HiInputs.begin(), HiInputs.end()); 08586 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), 08587 HiInputs.end()); 08588 08589 bool TargetLo = LoInputs.size() >= HiInputs.size(); 08590 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs; 08591 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs; 08592 08593 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1}; 08594 SmallDenseMap<int, int, 8> LaneMap; 08595 for (int I : InPlaceInputs) { 08596 PreDupI16Shuffle[I/2] = I/2; 08597 LaneMap[I] = I; 08598 } 08599 int j = TargetLo ? 0 : 4, je = j + 4; 08600 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) { 08601 // Check if j is already a shuffle of this input. This happens when 08602 // there are two adjacent bytes after we move the low one. 08603 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) { 08604 // If we haven't yet mapped the input, search for a slot into which 08605 // we can map it. 08606 while (j < je && PreDupI16Shuffle[j] != -1) 08607 ++j; 08608 08609 if (j == je) 08610 // We can't place the inputs into a single half with a simple i16 shuffle, so bail. 08611 return SDValue(); 08612 08613 // Map this input with the i16 shuffle. 08614 PreDupI16Shuffle[j] = MovingInputs[i] / 2; 08615 } 08616 08617 // Update the lane map based on the mapping we ended up with. 08618 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2; 08619 } 08620 V1 = DAG.getNode( 08621 ISD::BITCAST, DL, MVT::v16i8, 08622 DAG.getVectorShuffle(MVT::v8i16, DL, 08623 DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1), 08624 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle)); 08625 08626 // Unpack the bytes to form the i16s that will be shuffled into place. 08627 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, 08628 MVT::v16i8, V1, V1); 08629 08630 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; 08631 for (int i = 0; i < 16; i += 2) { 08632 if (Mask[i] != -1) 08633 PostDupI16Shuffle[i / 2] = LaneMap[Mask[i]] - (TargetLo ? 0 : 8); 08634 assert(PostDupI16Shuffle[i / 2] < 8 && "Invalid v8 shuffle mask!"); 08635 } 08636 return DAG.getNode( 08637 ISD::BITCAST, DL, MVT::v16i8, 08638 DAG.getVectorShuffle(MVT::v8i16, DL, 08639 DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1), 08640 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle)); 08641 }; 08642 if (SDValue V = tryToWidenViaDuplication()) 08643 return V; 08644 } 08645 08646 // Check whether an interleaving lowering is likely to be more efficient. 08647 // This isn't perfect but it is a strong heuristic that tends to work well on 08648 // the kinds of shuffles that show up in practice. 08649 // 08650 // FIXME: We need to handle other interleaving widths (i16, i32, ...). 
08651 if (shouldLowerAsInterleaving(Mask)) { 08652 // FIXME: Figure out whether we should pack these into the low or high 08653 // halves. 08654 08655 int EMask[16], OMask[16]; 08656 for (int i = 0; i < 8; ++i) { 08657 EMask[i] = Mask[2*i]; 08658 OMask[i] = Mask[2*i + 1]; 08659 EMask[i + 8] = -1; 08660 OMask[i + 8] = -1; 08661 } 08662 08663 SDValue Evens = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, EMask); 08664 SDValue Odds = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, OMask); 08665 08666 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, Evens, Odds); 08667 } 08668 08669 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly 08670 // with PSHUFB. It is important to do this before we attempt to generate any 08671 // blends but after all of the single-input lowerings. If the single input 08672 // lowerings can find an instruction sequence that is faster than a PSHUFB, we 08673 // want to preserve that and we can DAG combine any longer sequences into 08674 // a PSHUFB in the end. But once we start blending from multiple inputs, 08675 // the complexity of DAG combining bad patterns back into PSHUFB is too high, 08676 // and there are *very* few patterns that would actually be faster than the 08677 // PSHUFB approach because of its ability to zero lanes. 08678 // 08679 // FIXME: The only exceptions to the above are blends which are exact 08680 // interleavings with direct instructions supporting them. We currently don't 08681 // handle those well here. 08682 if (Subtarget->hasSSSE3()) { 08683 SDValue V1Mask[16]; 08684 SDValue V2Mask[16]; 08685 for (int i = 0; i < 16; ++i) 08686 if (Mask[i] == -1) { 08687 V1Mask[i] = V2Mask[i] = DAG.getConstant(0x80, MVT::i8); 08688 } else { 08689 V1Mask[i] = DAG.getConstant(Mask[i] < 16 ? Mask[i] : 0x80, MVT::i8); 08690 V2Mask[i] = 08691 DAG.getConstant(Mask[i] < 16 ? 0x80 : Mask[i] - 16, MVT::i8); 08692 } 08693 V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V1, 08694 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask)); 08695 if (isSingleInputShuffleMask(Mask)) 08696 return V1; // Single inputs are easy. 08697 08698 // Otherwise, blend the two. 08699 V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V2, 08700 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask)); 08701 return DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2); 08702 } 08703 08704 // There are special ways we can lower some single-element blends. 08705 if (NumV2Elements == 1) 08706 if (SDValue V = lowerIntegerElementInsertionVectorShuffle( 08707 MVT::v16i8, DL, V1, V2, Mask, Subtarget, DAG)) 08708 return V; 08709 08710 // Check whether a compaction lowering can be done. This handles shuffles 08711 // which take every Nth element for some even N. See the helper function for 08712 // details. 08713 // 08714 // We special case these as they can be particularly efficiently handled with 08715 // the PACKUSB instruction on x86 and they show up in common patterns of 08716 // rearranging bytes to truncate wide elements. 08717 if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask)) { 08718 // NumEvenDrops is the power of two stride of the elements. Another way of 08719 // thinking about it is that we need to drop the even elements this many 08720 // times to get the original input. 08721 bool IsSingleInput = isSingleInputShuffleMask(Mask); 08722 08723 // First we need to zero all the dropped bytes. 
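    // For example, with Mask == <0,2,4,...,14,16,18,...,30> (take every other
    // byte, as in an i16->i8 truncate) NumEvenDrops is 1: the AND below keeps
    // only the low byte of each 16-bit word and a single PACKUS then compacts
    // the two inputs into the requested order.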
08724 assert(NumEvenDrops <= 3 && 08725 "No support for dropping even elements more than 3 times."); 08726 // We use the mask type to pick which bytes are preserved based on how many 08727 // elements are dropped. 08728 MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 }; 08729 SDValue ByteClearMask = 08730 DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, 08731 DAG.getConstant(0xFF, MaskVTs[NumEvenDrops - 1])); 08732 V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask); 08733 if (!IsSingleInput) 08734 V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask); 08735 08736 // Now pack things back together. 08737 V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1); 08738 V2 = IsSingleInput ? V1 : DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2); 08739 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2); 08740 for (int i = 1; i < NumEvenDrops; ++i) { 08741 Result = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Result); 08742 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result); 08743 } 08744 08745 return Result; 08746 } 08747 08748 int V1LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; 08749 int V1HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; 08750 int V2LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; 08751 int V2HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; 08752 08753 auto buildBlendMasks = [](MutableArrayRef<int> HalfMask, 08754 MutableArrayRef<int> V1HalfBlendMask, 08755 MutableArrayRef<int> V2HalfBlendMask) { 08756 for (int i = 0; i < 8; ++i) 08757 if (HalfMask[i] >= 0 && HalfMask[i] < 16) { 08758 V1HalfBlendMask[i] = HalfMask[i]; 08759 HalfMask[i] = i; 08760 } else if (HalfMask[i] >= 16) { 08761 V2HalfBlendMask[i] = HalfMask[i] - 16; 08762 HalfMask[i] = i + 8; 08763 } 08764 }; 08765 buildBlendMasks(LoMask, V1LoBlendMask, V2LoBlendMask); 08766 buildBlendMasks(HiMask, V1HiBlendMask, V2HiBlendMask); 08767 08768 SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL); 08769 08770 auto buildLoAndHiV8s = [&](SDValue V, MutableArrayRef<int> LoBlendMask, 08771 MutableArrayRef<int> HiBlendMask) { 08772 SDValue V1, V2; 08773 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask 08774 // them out and avoid using UNPCK{L,H} to extract the elements of V as 08775 // i16s. 08776 if (std::none_of(LoBlendMask.begin(), LoBlendMask.end(), 08777 [](int M) { return M >= 0 && M % 2 == 1; }) && 08778 std::none_of(HiBlendMask.begin(), HiBlendMask.end(), 08779 [](int M) { return M >= 0 && M % 2 == 1; })) { 08780 // Use a mask to drop the high bytes. 08781 V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V); 08782 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, V1, 08783 DAG.getConstant(0x00FF, MVT::v8i16)); 08784 08785 // This will be a single vector shuffle instead of a blend so nuke V2. 08786 V2 = DAG.getUNDEF(MVT::v8i16); 08787 08788 // Squash the masks to point directly into V1. 08789 for (int &M : LoBlendMask) 08790 if (M >= 0) 08791 M /= 2; 08792 for (int &M : HiBlendMask) 08793 if (M >= 0) 08794 M /= 2; 08795 } else { 08796 // Otherwise just unpack the low half of V into V1 and the high half into 08797 // V2 so that we can blend them as i16s. 
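      // Unpacking against the zero vector zero-extends the bytes: UNPCKL(V,
      // Zero) yields the words (V[0], ..., V[7]) and UNPCKH(V, Zero) yields
      // (V[8], ..., V[15]), so the v8i16 blends below can reach every byte
      // of V.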
08798 V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, 08799 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero)); 08800 V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, 08801 DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero)); 08802 } 08803 08804 SDValue BlendedLo = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask); 08805 SDValue BlendedHi = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask); 08806 return std::make_pair(BlendedLo, BlendedHi); 08807 }; 08808 SDValue V1Lo, V1Hi, V2Lo, V2Hi; 08809 std::tie(V1Lo, V1Hi) = buildLoAndHiV8s(V1, V1LoBlendMask, V1HiBlendMask); 08810 std::tie(V2Lo, V2Hi) = buildLoAndHiV8s(V2, V2LoBlendMask, V2HiBlendMask); 08811 08812 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Lo, V2Lo, LoMask); 08813 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Hi, V2Hi, HiMask); 08814 08815 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV); 08816 } 08817 08818 /// \brief Dispatching routine to lower various 128-bit x86 vector shuffles. 08819 /// 08820 /// This routine breaks down the specific type of 128-bit shuffle and 08821 /// dispatches to the lowering routines accordingly. 08822 static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, 08823 MVT VT, const X86Subtarget *Subtarget, 08824 SelectionDAG &DAG) { 08825 switch (VT.SimpleTy) { 08826 case MVT::v2i64: 08827 return lowerV2I64VectorShuffle(Op, V1, V2, Subtarget, DAG); 08828 case MVT::v2f64: 08829 return lowerV2F64VectorShuffle(Op, V1, V2, Subtarget, DAG); 08830 case MVT::v4i32: 08831 return lowerV4I32VectorShuffle(Op, V1, V2, Subtarget, DAG); 08832 case MVT::v4f32: 08833 return lowerV4F32VectorShuffle(Op, V1, V2, Subtarget, DAG); 08834 case MVT::v8i16: 08835 return lowerV8I16VectorShuffle(Op, V1, V2, Subtarget, DAG); 08836 case MVT::v16i8: 08837 return lowerV16I8VectorShuffle(Op, V1, V2, Subtarget, DAG); 08838 08839 default: 08840 llvm_unreachable("Unimplemented!"); 08841 } 08842 } 08843 08844 static bool isHalfCrossingShuffleMask(ArrayRef<int> Mask) { 08845 int Size = Mask.size(); 08846 for (int M : Mask.slice(0, Size / 2)) 08847 if (M >= 0 && (M % Size) >= Size / 2) 08848 return true; 08849 for (int M : Mask.slice(Size / 2, Size / 2)) 08850 if (M >= 0 && (M % Size) < Size / 2) 08851 return true; 08852 return false; 08853 } 08854 08855 /// \brief Generic routine to split a 256-bit vector shuffle into 128-bit 08856 /// shuffles. 08857 /// 08858 /// There is a severely limited set of shuffles available in AVX1 for 256-bit 08859 /// vectors resulting in routinely needing to split the shuffle into two 128-bit 08860 /// shuffles. This can be done generically for any 256-bit vector shuffle and so 08861 /// we encode the logic here for specific shuffle lowering routines to bail to 08862 /// when they exhaust the features avaible to more directly handle the shuffle. 
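// As a concrete illustration of the decomposition below: for a v4f64 shuffle
// with mask <0,4,7,3>, the low half-mask <0,4> becomes V1BlendMask <0,u>,
// V2BlendMask <u,0> and BlendMask <0,3>, while the high half-mask <7,3>
// becomes V1BlendMask <u,3>, V2BlendMask <3,u> and BlendMask <2,1>. Each half
// is then formed by two 128-bit shuffles of the extracted halves plus a final
// blend-style shuffle, and the two halves are concatenated back together.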
08863 static SDValue splitAndLower256BitVectorShuffle(SDValue Op, SDValue V1, 08864 SDValue V2, 08865 const X86Subtarget *Subtarget, 08866 SelectionDAG &DAG) { 08867 SDLoc DL(Op); 08868 MVT VT = Op.getSimpleValueType(); 08869 assert(VT.getSizeInBits() == 256 && "Only for 256-bit vector shuffles!"); 08870 assert(V1.getSimpleValueType() == VT && "Bad operand type!"); 08871 assert(V2.getSimpleValueType() == VT && "Bad operand type!"); 08872 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 08873 ArrayRef<int> Mask = SVOp->getMask(); 08874 08875 ArrayRef<int> LoMask = Mask.slice(0, Mask.size()/2); 08876 ArrayRef<int> HiMask = Mask.slice(Mask.size()/2); 08877 08878 int NumElements = VT.getVectorNumElements(); 08879 int SplitNumElements = NumElements / 2; 08880 MVT ScalarVT = VT.getScalarType(); 08881 MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2); 08882 08883 SDValue LoV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1, 08884 DAG.getIntPtrConstant(0)); 08885 SDValue HiV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1, 08886 DAG.getIntPtrConstant(SplitNumElements)); 08887 SDValue LoV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2, 08888 DAG.getIntPtrConstant(0)); 08889 SDValue HiV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2, 08890 DAG.getIntPtrConstant(SplitNumElements)); 08891 08892 // Now create two 4-way blends of these half-width vectors. 08893 auto HalfBlend = [&](ArrayRef<int> HalfMask) { 08894 SmallVector<int, 16> V1BlendMask, V2BlendMask, BlendMask; 08895 for (int i = 0; i < SplitNumElements; ++i) { 08896 int M = HalfMask[i]; 08897 if (M >= NumElements) { 08898 V2BlendMask.push_back(M - NumElements); 08899 V1BlendMask.push_back(-1); 08900 BlendMask.push_back(SplitNumElements + i); 08901 } else if (M >= 0) { 08902 V2BlendMask.push_back(-1); 08903 V1BlendMask.push_back(M); 08904 BlendMask.push_back(i); 08905 } else { 08906 V2BlendMask.push_back(-1); 08907 V1BlendMask.push_back(-1); 08908 BlendMask.push_back(-1); 08909 } 08910 } 08911 SDValue V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask); 08912 SDValue V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask); 08913 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask); 08914 }; 08915 SDValue Lo = HalfBlend(LoMask); 08916 SDValue Hi = HalfBlend(HiMask); 08917 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); 08918 } 08919 08920 /// \brief Handle lowering of 4-lane 64-bit floating point shuffles. 08921 /// 08922 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2 08923 /// isn't available. 08924 static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, 08925 const X86Subtarget *Subtarget, 08926 SelectionDAG &DAG) { 08927 SDLoc DL(Op); 08928 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!"); 08929 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!"); 08930 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 08931 ArrayRef<int> Mask = SVOp->getMask(); 08932 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); 08933 08934 // FIXME: If we have AVX2, we should delegate to generic code as crossing 08935 // shuffles aren't a problem and FP and int have the same patterns. 08936 08937 // FIXME: We can handle these more cleverly than splitting for v4f64. 
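  // For example, the single-input mask <1,0,3,2> (swap within each 128-bit
  // lane) is not half-crossing, and the immediate built for it below is
  // 0b0101, which is exactly the per-lane element select that VPERMILPD
  // expects.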
08938 if (isHalfCrossingShuffleMask(Mask)) 08939 return splitAndLower256BitVectorShuffle(Op, V1, V2, Subtarget, DAG); 08940 08941 if (isSingleInputShuffleMask(Mask)) { 08942 // Non-half-crossing single input shuffles can be lowerid with an 08943 // interleaved permutation. 08944 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) | 08945 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3); 08946 return DAG.getNode(X86ISD::VPERMILP, DL, MVT::v4f64, V1, 08947 DAG.getConstant(VPERMILPMask, MVT::i8)); 08948 } 08949 08950 // X86 has dedicated unpack instructions that can handle specific blend 08951 // operations: UNPCKH and UNPCKL. 08952 if (isShuffleEquivalent(Mask, 0, 4, 2, 6)) 08953 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V1, V2); 08954 if (isShuffleEquivalent(Mask, 1, 5, 3, 7)) 08955 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V1, V2); 08956 // FIXME: It would be nice to find a way to get canonicalization to commute 08957 // these patterns. 08958 if (isShuffleEquivalent(Mask, 4, 0, 6, 2)) 08959 return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V2, V1); 08960 if (isShuffleEquivalent(Mask, 5, 1, 7, 3)) 08961 return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V2, V1); 08962 08963 // Check if the blend happens to exactly fit that of SHUFPD. 08964 if (Mask[0] < 4 && (Mask[1] == -1 || Mask[1] >= 4) && 08965 Mask[2] < 4 && (Mask[3] == -1 || Mask[3] >= 4)) { 08966 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 5) << 1) | 08967 ((Mask[2] == 3) << 2) | ((Mask[3] == 7) << 3); 08968 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V1, V2, 08969 DAG.getConstant(SHUFPDMask, MVT::i8)); 08970 } 08971 if ((Mask[0] == -1 || Mask[0] >= 4) && Mask[1] < 4 && 08972 (Mask[2] == -1 || Mask[2] >= 4) && Mask[3] < 4) { 08973 unsigned SHUFPDMask = (Mask[0] == 5) | ((Mask[1] == 1) << 1) | 08974 ((Mask[2] == 7) << 2) | ((Mask[3] == 3) << 3); 08975 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V2, V1, 08976 DAG.getConstant(SHUFPDMask, MVT::i8)); 08977 } 08978 08979 // Shuffle the input elements into the desired positions in V1 and V2 and 08980 // blend them together. 08981 int V1Mask[] = {-1, -1, -1, -1}; 08982 int V2Mask[] = {-1, -1, -1, -1}; 08983 for (int i = 0; i < 4; ++i) 08984 if (Mask[i] >= 0 && Mask[i] < 4) 08985 V1Mask[i] = Mask[i]; 08986 else if (Mask[i] >= 4) 08987 V2Mask[i] = Mask[i] - 4; 08988 08989 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, V1, DAG.getUNDEF(MVT::v4f64), V1Mask); 08990 V2 = DAG.getVectorShuffle(MVT::v4f64, DL, V2, DAG.getUNDEF(MVT::v4f64), V2Mask); 08991 08992 unsigned BlendMask = 0; 08993 for (int i = 0; i < 4; ++i) 08994 if (Mask[i] >= 4) 08995 BlendMask |= 1 << i; 08996 08997 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v4f64, V1, V2, 08998 DAG.getConstant(BlendMask, MVT::i8)); 08999 } 09000 09001 /// \brief Handle lowering of 4-lane 64-bit integer shuffles. 09002 /// 09003 /// Largely delegates to common code when we have AVX2 and to the floating-point 09004 /// code when we only have AVX. 
09005 static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, 09006 const X86Subtarget *Subtarget, 09007 SelectionDAG &DAG) { 09008 SDLoc DL(Op); 09009 assert(Op.getSimpleValueType() == MVT::v4i64 && "Bad shuffle type!"); 09010 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!"); 09011 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!"); 09012 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 09013 ArrayRef<int> Mask = SVOp->getMask(); 09014 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); 09015 09016 // FIXME: If we have AVX2, we should delegate to generic code as crossing 09017 // shuffles aren't a problem and FP and int have the same patterns. 09018 09019 if (isHalfCrossingShuffleMask(Mask)) 09020 return splitAndLower256BitVectorShuffle(Op, V1, V2, Subtarget, DAG); 09021 09022 // AVX1 doesn't provide any facilities for v4i64 shuffles, bitcast and 09023 // delegate to floating point code. 09024 V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v4f64, V1); 09025 V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v4f64, V2); 09026 return DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, 09027 lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG)); 09028 } 09029 09030 /// \brief High-level routine to lower various 256-bit x86 vector shuffles. 09031 /// 09032 /// This routine either breaks down the specific type of a 256-bit x86 vector 09033 /// shuffle or splits it into two 128-bit shuffles and fuses the results back 09034 /// together based on the available instructions. 09035 static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, 09036 MVT VT, const X86Subtarget *Subtarget, 09037 SelectionDAG &DAG) { 09038 switch (VT.SimpleTy) { 09039 case MVT::v4f64: 09040 return lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG); 09041 case MVT::v4i64: 09042 return lowerV4I64VectorShuffle(Op, V1, V2, Subtarget, DAG); 09043 case MVT::v8i32: 09044 case MVT::v8f32: 09045 case MVT::v16i16: 09046 case MVT::v32i8: 09047 // Fall back to the basic pattern of extracting the high half and forming 09048 // a 4-way blend. 09049 // FIXME: Add targeted lowering for each type that can document rationale 09050 // for delegating to this when necessary. 09051 return splitAndLower256BitVectorShuffle(Op, V1, V2, Subtarget, DAG); 09052 09053 default: 09054 llvm_unreachable("Not a valid 256-bit x86 vector type!"); 09055 } 09056 } 09057 09058 /// \brief Tiny helper function to test whether a shuffle mask could be 09059 /// simplified by widening the elements being shuffled. 09060 static bool canWidenShuffleElements(ArrayRef<int> Mask) { 09061 for (int i = 0, Size = Mask.size(); i < Size; i += 2) 09062 if ((Mask[i] != -1 && Mask[i] % 2 != 0) || 09063 (Mask[i + 1] != -1 && (Mask[i + 1] % 2 != 1 || 09064 (Mask[i] != -1 && Mask[i] + 1 != Mask[i + 1])))) 09065 return false; 09066 09067 return true; 09068 } 09069 09070 /// \brief Top-level lowering for x86 vector shuffles. 09071 /// 09072 /// This handles decomposition, canonicalization, and lowering of all x86 09073 /// vector shuffles. Most of the specific lowering strategies are encapsulated 09074 /// above in helper routines. The canonicalization attempts to widen shuffles 09075 /// to involve fewer lanes of wider elements, consolidate symmetric patterns 09076 /// s.t. only one of the two inputs needs to be tested, etc. 
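// For example, the canonicalization below turns the v8i16 mask
// <0,1,10,11,4,5,14,15> into the v4i32 mask <0,5,2,7> by pairing adjacent
// elements, so the rest of the lowering only has to deal with four wide lanes
// instead of eight narrow ones.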
09077 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, 09078 SelectionDAG &DAG) { 09079 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 09080 ArrayRef<int> Mask = SVOp->getMask(); 09081 SDValue V1 = Op.getOperand(0); 09082 SDValue V2 = Op.getOperand(1); 09083 MVT VT = Op.getSimpleValueType(); 09084 int NumElements = VT.getVectorNumElements(); 09085 SDLoc dl(Op); 09086 09087 assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles"); 09088 09089 bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; 09090 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; 09091 if (V1IsUndef && V2IsUndef) 09092 return DAG.getUNDEF(VT); 09093 09094 // When we create a shuffle node we put the UNDEF node to second operand, 09095 // but in some cases the first operand may be transformed to UNDEF. 09096 // In this case we should just commute the node. 09097 if (V1IsUndef) 09098 return DAG.getCommutedVectorShuffle(*SVOp); 09099 09100 // Check for non-undef masks pointing at an undef vector and make the masks 09101 // undef as well. This makes it easier to match the shuffle based solely on 09102 // the mask. 09103 if (V2IsUndef) 09104 for (int M : Mask) 09105 if (M >= NumElements) { 09106 SmallVector<int, 8> NewMask(Mask.begin(), Mask.end()); 09107 for (int &M : NewMask) 09108 if (M >= NumElements) 09109 M = -1; 09110 return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask); 09111 } 09112 09113 // For integer vector shuffles, try to collapse them into a shuffle of fewer 09114 // lanes but wider integers. We cap this to not form integers larger than i64 09115 // but it might be interesting to form i128 integers to handle flipping the 09116 // low and high halves of AVX 256-bit vectors. 09117 if (VT.isInteger() && VT.getScalarSizeInBits() < 64 && 09118 canWidenShuffleElements(Mask)) { 09119 SmallVector<int, 8> NewMask; 09120 for (int i = 0, Size = Mask.size(); i < Size; i += 2) 09121 NewMask.push_back(Mask[i] != -1 09122 ? Mask[i] / 2 09123 : (Mask[i + 1] != -1 ? Mask[i + 1] / 2 : -1)); 09124 MVT NewVT = 09125 MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits() * 2), 09126 VT.getVectorNumElements() / 2); 09127 V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1); 09128 V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2); 09129 return DAG.getNode(ISD::BITCAST, dl, VT, 09130 DAG.getVectorShuffle(NewVT, dl, V1, V2, NewMask)); 09131 } 09132 09133 int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0; 09134 for (int M : SVOp->getMask()) 09135 if (M < 0) 09136 ++NumUndefElements; 09137 else if (M < NumElements) 09138 ++NumV1Elements; 09139 else 09140 ++NumV2Elements; 09141 09142 // Commute the shuffle as needed such that more elements come from V1 than 09143 // V2. This allows us to match the shuffle pattern strictly on how many 09144 // elements come from V1 without handling the symmetric cases. 09145 if (NumV2Elements > NumV1Elements) 09146 return DAG.getCommutedVectorShuffle(*SVOp); 09147 09148 // When the number of V1 and V2 elements are the same, try to minimize the 09149 // number of uses of V2 in the low half of the vector. 09150 if (NumV1Elements == NumV2Elements) { 09151 int LowV1Elements = 0, LowV2Elements = 0; 09152 for (int M : SVOp->getMask().slice(0, NumElements / 2)) 09153 if (M >= NumElements) 09154 ++LowV2Elements; 09155 else if (M >= 0) 09156 ++LowV1Elements; 09157 if (LowV2Elements > LowV1Elements) 09158 return DAG.getCommutedVectorShuffle(*SVOp); 09159 } 09160 09161 // For each vector width, delegate to a specialized lowering routine. 
09162   if (VT.getSizeInBits() == 128)
09163     return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
09164 
09165   if (VT.getSizeInBits() == 256)
09166     return lower256BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
09167 
09168   llvm_unreachable("Unimplemented!");
09169 }
09170 
09171 
09172 //===----------------------------------------------------------------------===//
09173 // Legacy vector shuffle lowering
09174 //
09175 // This code is the legacy code handling vector shuffles until the above
09176 // replaces its functionality and performance.
09177 //===----------------------------------------------------------------------===//
09178 
09179 static bool isBlendMask(ArrayRef<int> MaskVals, MVT VT, bool hasSSE41,
09180                         bool hasInt256, unsigned *MaskOut = nullptr) {
09181   MVT EltVT = VT.getVectorElementType();
09182 
09183   // There is no blend with immediate in AVX-512.
09184   if (VT.is512BitVector())
09185     return false;
09186 
09187   if (!hasSSE41 || EltVT == MVT::i8)
09188     return false;
09189   if (!hasInt256 && VT == MVT::v16i16)
09190     return false;
09191 
09192   unsigned MaskValue = 0;
09193   unsigned NumElems = VT.getVectorNumElements();
09194   // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
09195   unsigned NumLanes = (NumElems - 1) / 8 + 1;
09196   unsigned NumElemsInLane = NumElems / NumLanes;
09197 
09198   // Blend for v16i16 should be symmetric for both lanes.
09199   for (unsigned i = 0; i < NumElemsInLane; ++i) {
09200 
09201     int SndLaneEltIdx = (NumLanes == 2) ? MaskVals[i + NumElemsInLane] : -1;
09202     int EltIdx = MaskVals[i];
09203 
09204     if ((EltIdx < 0 || EltIdx == (int)i) &&
09205         (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane)))
09206       continue;
09207 
09208     if (((unsigned)EltIdx == (i + NumElems)) &&
09209         (SndLaneEltIdx < 0 ||
09210          (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane))
09211       MaskValue |= (1 << i);
09212     else
09213       return false;
09214   }
09215 
09216   if (MaskOut)
09217     *MaskOut = MaskValue;
09218   return true;
09219 }
09220 
09221 // Try to lower a shuffle node into a simple blend instruction.
09222 // This function assumes isBlendMask returns true for this
09223 // ShuffleVectorSDNode.
09224 static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
09225                                           unsigned MaskValue,
09226                                           const X86Subtarget *Subtarget,
09227                                           SelectionDAG &DAG) {
09228   MVT VT = SVOp->getSimpleValueType(0);
09229   MVT EltVT = VT.getVectorElementType();
09230   assert(isBlendMask(SVOp->getMask(), VT, Subtarget->hasSSE41(),
09231                      Subtarget->hasInt256()) &&
09232          "Trying to lower a VECTOR_SHUFFLE to a Blend but "
09233          "with the wrong mask");
09234   SDValue V1 = SVOp->getOperand(0);
09235   SDValue V2 = SVOp->getOperand(1);
09236   SDLoc dl(SVOp);
09237   unsigned NumElems = VT.getVectorNumElements();
09238 
09239   // Convert i32 vectors to floating point if it is not AVX2.
09240   // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
09241   MVT BlendVT = VT;
09242   if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
09243     BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
09244                                NumElems);
09245     V1 = DAG.getNode(ISD::BITCAST, dl, BlendVT, V1);
09246     V2 = DAG.getNode(ISD::BITCAST, dl, BlendVT, V2);
09247   }
09248 
09249   SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2,
09250                             DAG.getConstant(MaskValue, MVT::i32));
09251   return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
09252 }
09253 
09254 /// In vector type \p VT, return true if the element at index \p InputIdx
09255 /// falls on a different 128-bit lane than \p OutputIdx.
09256 static bool ShuffleCrosses128bitLane(MVT VT, unsigned InputIdx,
09257                                      unsigned OutputIdx) {
09258   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
09259   return InputIdx * EltSize / 128 != OutputIdx * EltSize / 128;
09260 }
09261 
09262 /// Generate a PSHUFB if possible. Selects elements from \p V1 according to
09263 /// \p MaskVals. MaskVals[OutputIdx] = InputIdx specifies that we want to
09264 /// shuffle the element at InputIdx in V1 to OutputIdx in the result. If \p
09265 /// MaskVals refers to elements outside of \p V1 or is undef (-1), insert a
09266 /// zero.
09267 static SDValue getPSHUFB(ArrayRef<int> MaskVals, SDValue V1, SDLoc &dl,
09268                          SelectionDAG &DAG) {
09269   MVT VT = V1.getSimpleValueType();
09270   assert(VT.is128BitVector() || VT.is256BitVector());
09271 
09272   MVT EltVT = VT.getVectorElementType();
09273   unsigned EltSizeInBytes = EltVT.getSizeInBits() / 8;
09274   unsigned NumElts = VT.getVectorNumElements();
09275 
09276   SmallVector<SDValue, 32> PshufbMask;
09277   for (unsigned OutputIdx = 0; OutputIdx < NumElts; ++OutputIdx) {
09278     int InputIdx = MaskVals[OutputIdx];
09279     unsigned InputByteIdx;
09280 
09281     if (InputIdx < 0 || NumElts <= (unsigned)InputIdx)
09282       InputByteIdx = 0x80;
09283     else {
09284       // Cross lane is not allowed.
09285       if (ShuffleCrosses128bitLane(VT, InputIdx, OutputIdx))
09286         return SDValue();
09287       InputByteIdx = InputIdx * EltSizeInBytes;
09288       // Index is a byte offset within the 128-bit lane.
09289       InputByteIdx &= 0xf;
09290     }
09291 
09292     for (unsigned j = 0; j < EltSizeInBytes; ++j) {
09293       PshufbMask.push_back(DAG.getConstant(InputByteIdx, MVT::i8));
09294       if (InputByteIdx != 0x80)
09295         ++InputByteIdx;
09296     }
09297   }
09298 
09299   MVT ShufVT = MVT::getVectorVT(MVT::i8, PshufbMask.size());
09300   if (ShufVT != VT)
09301     V1 = DAG.getNode(ISD::BITCAST, dl, ShufVT, V1);
09302   return DAG.getNode(X86ISD::PSHUFB, dl, ShufVT, V1,
09303                      DAG.getNode(ISD::BUILD_VECTOR, dl, ShufVT, PshufbMask));
09304 }
09305 
09306 // v8i16 shuffles - Prefer shuffles in the following order:
09307 // 1. [all]   pshuflw, pshufhw, optional move
09308 // 2. [ssse3] 1 x pshufb
09309 // 3. [ssse3] 2 x pshufb + 1 x por
09310 // 4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
09311 static SDValue
09312 LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
09313                          SelectionDAG &DAG) {
09314   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
09315   SDValue V1 = SVOp->getOperand(0);
09316   SDValue V2 = SVOp->getOperand(1);
09317   SDLoc dl(SVOp);
09318   SmallVector<int, 8> MaskVals;
09319 
09320   // Determine if more than 1 of the words in each of the low and high quadwords
09321   // of the result come from the same quadword of one of the two inputs. Undef
09322   // mask values count as coming from any quadword, for better codegen.
09323 // 09324 // Lo/HiQuad[i] = j indicates how many words from the ith quad of the input 09325 // feeds this quad. For i, 0 and 1 refer to V1, 2 and 3 refer to V2. 09326 unsigned LoQuad[] = { 0, 0, 0, 0 }; 09327 unsigned HiQuad[] = { 0, 0, 0, 0 }; 09328 // Indices of quads used. 09329 std::bitset<4> InputQuads; 09330 for (unsigned i = 0; i < 8; ++i) { 09331 unsigned *Quad = i < 4 ? LoQuad : HiQuad; 09332 int EltIdx = SVOp->getMaskElt(i); 09333 MaskVals.push_back(EltIdx); 09334 if (EltIdx < 0) { 09335 ++Quad[0]; 09336 ++Quad[1]; 09337 ++Quad[2]; 09338 ++Quad[3]; 09339 continue; 09340 } 09341 ++Quad[EltIdx / 4]; 09342 InputQuads.set(EltIdx / 4); 09343 } 09344 09345 int BestLoQuad = -1; 09346 unsigned MaxQuad = 1; 09347 for (unsigned i = 0; i < 4; ++i) { 09348 if (LoQuad[i] > MaxQuad) { 09349 BestLoQuad = i; 09350 MaxQuad = LoQuad[i]; 09351 } 09352 } 09353 09354 int BestHiQuad = -1; 09355 MaxQuad = 1; 09356 for (unsigned i = 0; i < 4; ++i) { 09357 if (HiQuad[i] > MaxQuad) { 09358 BestHiQuad = i; 09359 MaxQuad = HiQuad[i]; 09360 } 09361 } 09362 09363 // For SSSE3, If all 8 words of the result come from only 1 quadword of each 09364 // of the two input vectors, shuffle them into one input vector so only a 09365 // single pshufb instruction is necessary. If there are more than 2 input 09366 // quads, disable the next transformation since it does not help SSSE3. 09367 bool V1Used = InputQuads[0] || InputQuads[1]; 09368 bool V2Used = InputQuads[2] || InputQuads[3]; 09369 if (Subtarget->hasSSSE3()) { 09370 if (InputQuads.count() == 2 && V1Used && V2Used) { 09371 BestLoQuad = InputQuads[0] ? 0 : 1; 09372 BestHiQuad = InputQuads[2] ? 2 : 3; 09373 } 09374 if (InputQuads.count() > 2) { 09375 BestLoQuad = -1; 09376 BestHiQuad = -1; 09377 } 09378 } 09379 09380 // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update 09381 // the shuffle mask. If a quad is scored as -1, that means that it contains 09382 // words from all 4 input quadwords. 09383 SDValue NewV; 09384 if (BestLoQuad >= 0 || BestHiQuad >= 0) { 09385 int MaskV[] = { 09386 BestLoQuad < 0 ? 0 : BestLoQuad, 09387 BestHiQuad < 0 ? 1 : BestHiQuad 09388 }; 09389 NewV = DAG.getVectorShuffle(MVT::v2i64, dl, 09390 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1), 09391 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]); 09392 NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV); 09393 09394 // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the 09395 // source words for the shuffle, to aid later transformations. 09396 bool AllWordsInNewV = true; 09397 bool InOrder[2] = { true, true }; 09398 for (unsigned i = 0; i != 8; ++i) { 09399 int idx = MaskVals[i]; 09400 if (idx != (int)i) 09401 InOrder[i/4] = false; 09402 if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad) 09403 continue; 09404 AllWordsInNewV = false; 09405 break; 09406 } 09407 09408 bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV; 09409 if (AllWordsInNewV) { 09410 for (int i = 0; i != 8; ++i) { 09411 int idx = MaskVals[i]; 09412 if (idx < 0) 09413 continue; 09414 idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4; 09415 if ((idx != i) && idx < 4) 09416 pshufhw = false; 09417 if ((idx != i) && idx > 3) 09418 pshuflw = false; 09419 } 09420 V1 = NewV; 09421 V2Used = false; 09422 BestLoQuad = 0; 09423 BestHiQuad = 1; 09424 } 09425 09426 // If we've eliminated the use of V2, and the new mask is a pshuflw or 09427 // pshufhw, that's as cheap as it gets. Return the new shuffle. 
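    // For example, a mask of <0,1,2,3,5,4,7,6> drawn entirely from V1 leaves
    // the low quadword in order and only permutes words within the high one,
    // so a single pshufhw (with its immediate taken from the rewritten mask)
    // is emitted here.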
09428 if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) { 09429 unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW; 09430 unsigned TargetMask = 0; 09431 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, 09432 DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); 09433 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode()); 09434 TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp): 09435 getShufflePSHUFLWImmediate(SVOp); 09436 V1 = NewV.getOperand(0); 09437 return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG); 09438 } 09439 } 09440 09441 // Promote splats to a larger type which usually leads to more efficient code. 09442 // FIXME: Is this true if pshufb is available? 09443 if (SVOp->isSplat()) 09444 return PromoteSplat(SVOp, DAG); 09445 09446 // If we have SSSE3, and all words of the result are from 1 input vector, 09447 // case 2 is generated, otherwise case 3 is generated. If no SSSE3 09448 // is present, fall back to case 4. 09449 if (Subtarget->hasSSSE3()) { 09450 SmallVector<SDValue,16> pshufbMask; 09451 09452 // If we have elements from both input vectors, set the high bit of the 09453 // shuffle mask element to zero out elements that come from V2 in the V1 09454 // mask, and elements that come from V1 in the V2 mask, so that the two 09455 // results can be OR'd together. 09456 bool TwoInputs = V1Used && V2Used; 09457 V1 = getPSHUFB(MaskVals, V1, dl, DAG); 09458 if (!TwoInputs) 09459 return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 09460 09461 // Calculate the shuffle mask for the second input, shuffle it, and 09462 // OR it with the first shuffled input. 09463 CommuteVectorShuffleMask(MaskVals, 8); 09464 V2 = getPSHUFB(MaskVals, V2, dl, DAG); 09465 V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 09466 return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 09467 } 09468 09469 // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, 09470 // and update MaskVals with new element order. 09471 std::bitset<8> InOrder; 09472 if (BestLoQuad >= 0) { 09473 int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 }; 09474 for (int i = 0; i != 4; ++i) { 09475 int idx = MaskVals[i]; 09476 if (idx < 0) { 09477 InOrder.set(i); 09478 } else if ((idx / 4) == BestLoQuad) { 09479 MaskV[i] = idx & 3; 09480 InOrder.set(i); 09481 } 09482 } 09483 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 09484 &MaskV[0]); 09485 09486 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) { 09487 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode()); 09488 NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16, 09489 NewV.getOperand(0), 09490 getShufflePSHUFLWImmediate(SVOp), DAG); 09491 } 09492 } 09493 09494 // If BestHi >= 0, generate a pshufhw to put the high elements in order, 09495 // and update MaskVals with the new element order. 
09496 if (BestHiQuad >= 0) { 09497 int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 }; 09498 for (unsigned i = 4; i != 8; ++i) { 09499 int idx = MaskVals[i]; 09500 if (idx < 0) { 09501 InOrder.set(i); 09502 } else if ((idx / 4) == BestHiQuad) { 09503 MaskV[i] = (idx & 3) + 4; 09504 InOrder.set(i); 09505 } 09506 } 09507 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 09508 &MaskV[0]); 09509 09510 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) { 09511 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode()); 09512 NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16, 09513 NewV.getOperand(0), 09514 getShufflePSHUFHWImmediate(SVOp), DAG); 09515 } 09516 } 09517 09518 // In case BestHi & BestLo were both -1, which means each quadword has a word 09519 // from each of the four input quadwords, calculate the InOrder bitvector now 09520 // before falling through to the insert/extract cleanup. 09521 if (BestLoQuad == -1 && BestHiQuad == -1) { 09522 NewV = V1; 09523 for (int i = 0; i != 8; ++i) 09524 if (MaskVals[i] < 0 || MaskVals[i] == i) 09525 InOrder.set(i); 09526 } 09527 09528 // The other elements are put in the right place using pextrw and pinsrw. 09529 for (unsigned i = 0; i != 8; ++i) { 09530 if (InOrder[i]) 09531 continue; 09532 int EltIdx = MaskVals[i]; 09533 if (EltIdx < 0) 09534 continue; 09535 SDValue ExtOp = (EltIdx < 8) ? 09536 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, 09537 DAG.getIntPtrConstant(EltIdx)) : 09538 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, 09539 DAG.getIntPtrConstant(EltIdx - 8)); 09540 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, 09541 DAG.getIntPtrConstant(i)); 09542 } 09543 return NewV; 09544 } 09545 09546 /// \brief v16i16 shuffles 09547 /// 09548 /// FIXME: We only support generation of a single pshufb currently. We can 09549 /// generalize the other applicable cases from LowerVECTOR_SHUFFLEv8i16 as 09550 /// well (e.g 2 x pshufb + 1 x por). 09551 static SDValue 09552 LowerVECTOR_SHUFFLEv16i16(SDValue Op, SelectionDAG &DAG) { 09553 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 09554 SDValue V1 = SVOp->getOperand(0); 09555 SDValue V2 = SVOp->getOperand(1); 09556 SDLoc dl(SVOp); 09557 09558 if (V2.getOpcode() != ISD::UNDEF) 09559 return SDValue(); 09560 09561 SmallVector<int, 16> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end()); 09562 return getPSHUFB(MaskVals, V1, dl, DAG); 09563 } 09564 09565 // v16i8 shuffles - Prefer shuffles in the following order: 09566 // 1. [ssse3] 1 x pshufb 09567 // 2. [ssse3] 2 x pshufb + 1 x por 09568 // 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw 09569 static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, 09570 const X86Subtarget* Subtarget, 09571 SelectionDAG &DAG) { 09572 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 09573 SDValue V1 = SVOp->getOperand(0); 09574 SDValue V2 = SVOp->getOperand(1); 09575 SDLoc dl(SVOp); 09576 ArrayRef<int> MaskVals = SVOp->getMask(); 09577 09578 // Promote splats to a larger type which usually leads to more efficient code. 09579 // FIXME: Is this true if pshufb is available? 09580 if (SVOp->isSplat()) 09581 return PromoteSplat(SVOp, DAG); 09582 09583 // If we have SSSE3, case 1 is generated when all result bytes come from 09584 // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is 09585 // present, fall back to case 3. 09586 09587 // If SSSE3, use 1 pshufb instruction per vector with elements in the result. 
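  // For example, for a result byte taken from V1 (say MaskVals[i] == 5) the
  // V1 pshufb mask gets 5 and the V2 mask gets 0x80 (force zero); for a byte
  // taken from V2 (say MaskVals[i] == 20) the V1 mask gets 0x80 and the V2
  // mask gets 20 - 16 == 4, so OR-ing the two shuffled vectors recombines
  // them without needing a blend.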
09588 if (Subtarget->hasSSSE3()) { 09589 SmallVector<SDValue,16> pshufbMask; 09590 09591 // If all result elements are from one input vector, then only translate 09592 // undef mask values to 0x80 (zero out result) in the pshufb mask. 09593 // 09594 // Otherwise, we have elements from both input vectors, and must zero out 09595 // elements that come from V2 in the first mask, and V1 in the second mask 09596 // so that we can OR them together. 09597 for (unsigned i = 0; i != 16; ++i) { 09598 int EltIdx = MaskVals[i]; 09599 if (EltIdx < 0 || EltIdx >= 16) 09600 EltIdx = 0x80; 09601 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 09602 } 09603 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 09604 DAG.getNode(ISD::BUILD_VECTOR, dl, 09605 MVT::v16i8, pshufbMask)); 09606 09607 // As PSHUFB will zero elements with negative indices, it's safe to ignore 09608 // the 2nd operand if it's undefined or zero. 09609 if (V2.getOpcode() == ISD::UNDEF || 09610 ISD::isBuildVectorAllZeros(V2.getNode())) 09611 return V1; 09612 09613 // Calculate the shuffle mask for the second input, shuffle it, and 09614 // OR it with the first shuffled input. 09615 pshufbMask.clear(); 09616 for (unsigned i = 0; i != 16; ++i) { 09617 int EltIdx = MaskVals[i]; 09618 EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16; 09619 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 09620 } 09621 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 09622 DAG.getNode(ISD::BUILD_VECTOR, dl, 09623 MVT::v16i8, pshufbMask)); 09624 return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 09625 } 09626 09627 // No SSSE3 - Calculate in place words and then fix all out of place words 09628 // With 0-16 extracts & inserts. Worst case is 16 bytes out of order from 09629 // the 16 different words that comprise the two doublequadword input vectors. 09630 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 09631 V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2); 09632 SDValue NewV = V1; 09633 for (int i = 0; i != 8; ++i) { 09634 int Elt0 = MaskVals[i*2]; 09635 int Elt1 = MaskVals[i*2+1]; 09636 09637 // This word of the result is all undef, skip it. 09638 if (Elt0 < 0 && Elt1 < 0) 09639 continue; 09640 09641 // This word of the result is already in the correct place, skip it. 09642 if ((Elt0 == i*2) && (Elt1 == i*2+1)) 09643 continue; 09644 09645 SDValue Elt0Src = Elt0 < 16 ? V1 : V2; 09646 SDValue Elt1Src = Elt1 < 16 ? V1 : V2; 09647 SDValue InsElt; 09648 09649 // If Elt0 and Elt1 are defined, are consecutive, and can be load 09650 // using a single extract together, load it and store it. 09651 if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) { 09652 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 09653 DAG.getIntPtrConstant(Elt1 / 2)); 09654 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 09655 DAG.getIntPtrConstant(i)); 09656 continue; 09657 } 09658 09659 // If Elt1 is defined, extract it from the appropriate source. If the 09660 // source byte is not also odd, shift the extracted word left 8 bits 09661 // otherwise clear the bottom 8 bits if we need to do an or. 
09662 if (Elt1 >= 0) { 09663 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 09664 DAG.getIntPtrConstant(Elt1 / 2)); 09665 if ((Elt1 & 1) == 0) 09666 InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt, 09667 DAG.getConstant(8, 09668 TLI.getShiftAmountTy(InsElt.getValueType()))); 09669 else if (Elt0 >= 0) 09670 InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt, 09671 DAG.getConstant(0xFF00, MVT::i16)); 09672 } 09673 // If Elt0 is defined, extract it from the appropriate source. If the 09674 // source byte is not also even, shift the extracted word right 8 bits. If 09675 // Elt1 was also defined, OR the extracted values together before 09676 // inserting them in the result. 09677 if (Elt0 >= 0) { 09678 SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, 09679 Elt0Src, DAG.getIntPtrConstant(Elt0 / 2)); 09680 if ((Elt0 & 1) != 0) 09681 InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0, 09682 DAG.getConstant(8, 09683 TLI.getShiftAmountTy(InsElt0.getValueType()))); 09684 else if (Elt1 >= 0) 09685 InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0, 09686 DAG.getConstant(0x00FF, MVT::i16)); 09687 InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0) 09688 : InsElt0; 09689 } 09690 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 09691 DAG.getIntPtrConstant(i)); 09692 } 09693 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV); 09694 } 09695 09696 // v32i8 shuffles - Translate to VPSHUFB if possible. 09697 static 09698 SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp, 09699 const X86Subtarget *Subtarget, 09700 SelectionDAG &DAG) { 09701 MVT VT = SVOp->getSimpleValueType(0); 09702 SDValue V1 = SVOp->getOperand(0); 09703 SDValue V2 = SVOp->getOperand(1); 09704 SDLoc dl(SVOp); 09705 SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end()); 09706 09707 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; 09708 bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode()); 09709 bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode()); 09710 09711 // VPSHUFB may be generated if 09712 // (1) one of input vector is undefined or zeroinitializer. 09713 // The mask value 0x80 puts 0 in the corresponding slot of the vector. 09714 // And (2) the mask indexes don't cross the 128-bit lane. 09715 if (VT != MVT::v32i8 || !Subtarget->hasInt256() || 09716 (!V2IsUndef && !V2IsAllZero && !V1IsAllZero)) 09717 return SDValue(); 09718 09719 if (V1IsAllZero && !V2IsAllZero) { 09720 CommuteVectorShuffleMask(MaskVals, 32); 09721 V1 = V2; 09722 } 09723 return getPSHUFB(MaskVals, V1, dl, DAG); 09724 } 09725 09726 /// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide 09727 /// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be 09728 /// done when every pair / quad of shuffle mask elements point to elements in 09729 /// the right sequence. e.g. 
09730 /// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15> 09731 static 09732 SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, 09733 SelectionDAG &DAG) { 09734 MVT VT = SVOp->getSimpleValueType(0); 09735 SDLoc dl(SVOp); 09736 unsigned NumElems = VT.getVectorNumElements(); 09737 MVT NewVT; 09738 unsigned Scale; 09739 switch (VT.SimpleTy) { 09740 default: llvm_unreachable("Unexpected!"); 09741 case MVT::v2i64: 09742 case MVT::v2f64: 09743 return SDValue(SVOp, 0); 09744 case MVT::v4f32: NewVT = MVT::v2f64; Scale = 2; break; 09745 case MVT::v4i32: NewVT = MVT::v2i64; Scale = 2; break; 09746 case MVT::v8i16: NewVT = MVT::v4i32; Scale = 2; break; 09747 case MVT::v16i8: NewVT = MVT::v4i32; Scale = 4; break; 09748 case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break; 09749 case MVT::v32i8: NewVT = MVT::v8i32; Scale = 4; break; 09750 } 09751 09752 SmallVector<int, 8> MaskVec; 09753 for (unsigned i = 0; i != NumElems; i += Scale) { 09754 int StartIdx = -1; 09755 for (unsigned j = 0; j != Scale; ++j) { 09756 int EltIdx = SVOp->getMaskElt(i+j); 09757 if (EltIdx < 0) 09758 continue; 09759 if (StartIdx < 0) 09760 StartIdx = (EltIdx / Scale); 09761 if (EltIdx != (int)(StartIdx*Scale + j)) 09762 return SDValue(); 09763 } 09764 MaskVec.push_back(StartIdx); 09765 } 09766 09767 SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0)); 09768 SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1)); 09769 return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]); 09770 } 09771 09772 /// getVZextMovL - Return a zero-extending vector move low node. 09773 /// 09774 static SDValue getVZextMovL(MVT VT, MVT OpVT, 09775 SDValue SrcOp, SelectionDAG &DAG, 09776 const X86Subtarget *Subtarget, SDLoc dl) { 09777 if (VT == MVT::v2f64 || VT == MVT::v4f32) { 09778 LoadSDNode *LD = nullptr; 09779 if (!isScalarLoadToVector(SrcOp.getNode(), &LD)) 09780 LD = dyn_cast<LoadSDNode>(SrcOp); 09781 if (!LD) { 09782 // movssrr and movsdrr do not clear top bits. Try to use movd, movq 09783 // instead. 09784 MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32; 09785 if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) && 09786 SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR && 09787 SrcOp.getOperand(0).getOpcode() == ISD::BITCAST && 09788 SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) { 09789 // PR2108 09790 OpVT = (OpVT == MVT::v2f64) ? 
MVT::v2i64 : MVT::v4i32; 09791 return DAG.getNode(ISD::BITCAST, dl, VT, 09792 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 09793 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 09794 OpVT, 09795 SrcOp.getOperand(0) 09796 .getOperand(0)))); 09797 } 09798 } 09799 } 09800 09801 return DAG.getNode(ISD::BITCAST, dl, VT, 09802 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 09803 DAG.getNode(ISD::BITCAST, dl, 09804 OpVT, SrcOp))); 09805 } 09806 09807 /// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vectors shuffles 09808 /// which could not be matched by any known target speficic shuffle 09809 static SDValue 09810 LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 09811 09812 SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG); 09813 if (NewOp.getNode()) 09814 return NewOp; 09815 09816 MVT VT = SVOp->getSimpleValueType(0); 09817 09818 unsigned NumElems = VT.getVectorNumElements(); 09819 unsigned NumLaneElems = NumElems / 2; 09820 09821 SDLoc dl(SVOp); 09822 MVT EltVT = VT.getVectorElementType(); 09823 MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems); 09824 SDValue Output[2]; 09825 09826 SmallVector<int, 16> Mask; 09827 for (unsigned l = 0; l < 2; ++l) { 09828 // Build a shuffle mask for the output, discovering on the fly which 09829 // input vectors to use as shuffle operands (recorded in InputUsed). 09830 // If building a suitable shuffle vector proves too hard, then bail 09831 // out with UseBuildVector set. 09832 bool UseBuildVector = false; 09833 int InputUsed[2] = { -1, -1 }; // Not yet discovered. 09834 unsigned LaneStart = l * NumLaneElems; 09835 for (unsigned i = 0; i != NumLaneElems; ++i) { 09836 // The mask element. This indexes into the input. 09837 int Idx = SVOp->getMaskElt(i+LaneStart); 09838 if (Idx < 0) { 09839 // the mask element does not index into any input vector. 09840 Mask.push_back(-1); 09841 continue; 09842 } 09843 09844 // The input vector this mask element indexes into. 09845 int Input = Idx / NumLaneElems; 09846 09847 // Turn the index into an offset from the start of the input vector. 09848 Idx -= Input * NumLaneElems; 09849 09850 // Find or create a shuffle vector operand to hold this input. 09851 unsigned OpNo; 09852 for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) { 09853 if (InputUsed[OpNo] == Input) 09854 // This input vector is already an operand. 09855 break; 09856 if (InputUsed[OpNo] < 0) { 09857 // Create a new operand for this input vector. 09858 InputUsed[OpNo] = Input; 09859 break; 09860 } 09861 } 09862 09863 if (OpNo >= array_lengthof(InputUsed)) { 09864 // More than two input vectors used! Give up on trying to create a 09865 // shuffle vector. Insert all elements into a BUILD_VECTOR instead. 09866 UseBuildVector = true; 09867 break; 09868 } 09869 09870 // Add the mask index for the new shuffle vector. 09871 Mask.push_back(Idx + OpNo * NumLaneElems); 09872 } 09873 09874 if (UseBuildVector) { 09875 SmallVector<SDValue, 16> SVOps; 09876 for (unsigned i = 0; i != NumLaneElems; ++i) { 09877 // The mask element. This indexes into the input. 09878 int Idx = SVOp->getMaskElt(i+LaneStart); 09879 if (Idx < 0) { 09880 SVOps.push_back(DAG.getUNDEF(EltVT)); 09881 continue; 09882 } 09883 09884 // The input vector this mask element indexes into. 09885 int Input = Idx / NumElems; 09886 09887 // Turn the index into an offset from the start of the input vector. 09888 Idx -= Input * NumElems; 09889 09890 // Extract the vector element by hand. 
09891 SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, 09892 SVOp->getOperand(Input), 09893 DAG.getIntPtrConstant(Idx))); 09894 } 09895 09896 // Construct the output using a BUILD_VECTOR. 09897 Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, SVOps); 09898 } else if (InputUsed[0] < 0) { 09899 // No input vectors were used! The result is undefined. 09900 Output[l] = DAG.getUNDEF(NVT); 09901 } else { 09902 SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2), 09903 (InputUsed[0] % 2) * NumLaneElems, 09904 DAG, dl); 09905 // If only one input was used, use an undefined vector for the other. 09906 SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) : 09907 Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2), 09908 (InputUsed[1] % 2) * NumLaneElems, DAG, dl); 09909 // At least one input vector was used. Create a new shuffle vector. 09910 Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]); 09911 } 09912 09913 Mask.clear(); 09914 } 09915 09916 // Concatenate the result back 09917 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]); 09918 } 09919 09920 /// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with 09921 /// 4 elements, and match them with several different shuffle types. 09922 static SDValue 09923 LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 09924 SDValue V1 = SVOp->getOperand(0); 09925 SDValue V2 = SVOp->getOperand(1); 09926 SDLoc dl(SVOp); 09927 MVT VT = SVOp->getSimpleValueType(0); 09928 09929 assert(VT.is128BitVector() && "Unsupported vector size"); 09930 09931 std::pair<int, int> Locs[4]; 09932 int Mask1[] = { -1, -1, -1, -1 }; 09933 SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end()); 09934 09935 unsigned NumHi = 0; 09936 unsigned NumLo = 0; 09937 for (unsigned i = 0; i != 4; ++i) { 09938 int Idx = PermMask[i]; 09939 if (Idx < 0) { 09940 Locs[i] = std::make_pair(-1, -1); 09941 } else { 09942 assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!"); 09943 if (Idx < 4) { 09944 Locs[i] = std::make_pair(0, NumLo); 09945 Mask1[NumLo] = Idx; 09946 NumLo++; 09947 } else { 09948 Locs[i] = std::make_pair(1, NumHi); 09949 if (2+NumHi < 4) 09950 Mask1[2+NumHi] = Idx; 09951 NumHi++; 09952 } 09953 } 09954 } 09955 09956 if (NumLo <= 2 && NumHi <= 2) { 09957 // If no more than two elements come from either vector. This can be 09958 // implemented with two shuffles. First shuffle gather the elements. 09959 // The second shuffle, which takes the first shuffle as both of its 09960 // vector operands, put the elements into the right order. 09961 V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 09962 09963 int Mask2[] = { -1, -1, -1, -1 }; 09964 09965 for (unsigned i = 0; i != 4; ++i) 09966 if (Locs[i].first != -1) { 09967 unsigned Idx = (i < 2) ? 0 : 4; 09968 Idx += Locs[i].first * 2 + Locs[i].second; 09969 Mask2[i] = Idx; 09970 } 09971 09972 return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]); 09973 } 09974 09975 if (NumLo == 3 || NumHi == 3) { 09976 // Otherwise, we must have three elements from one vector, call it X, and 09977 // one element from the other, call it Y. First, use a shufps to build an 09978 // intermediate vector with the one element from Y and the element from X 09979 // that will be in the same half in the final destination (the indexes don't 09980 // matter). Then, use a shufps to build the final vector, taking the half 09981 // containing the element from Y from the intermediate, and the other half 09982 // from X. 
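    // For example, with PermMask == <4,1,2,3> (one element from V2 and three
    // from V1) HiIndex ends up 0: the first shufps builds the intermediate
    // <V2[0],u,V1[1],u>, and the second shufps with mask <0,2,6,7> of that
    // intermediate and V1 produces <V2[0],V1[1],V1[2],V1[3]>, i.e. the
    // requested shuffle.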
09983 if (NumHi == 3) { 09984 // Normalize it so the 3 elements come from V1. 09985 CommuteVectorShuffleMask(PermMask, 4); 09986 std::swap(V1, V2); 09987 } 09988 09989 // Find the element from V2. 09990 unsigned HiIndex; 09991 for (HiIndex = 0; HiIndex < 3; ++HiIndex) { 09992 int Val = PermMask[HiIndex]; 09993 if (Val < 0) 09994 continue; 09995 if (Val >= 4) 09996 break; 09997 } 09998 09999 Mask1[0] = PermMask[HiIndex]; 10000 Mask1[1] = -1; 10001 Mask1[2] = PermMask[HiIndex^1]; 10002 Mask1[3] = -1; 10003 V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 10004 10005 if (HiIndex >= 2) { 10006 Mask1[0] = PermMask[0]; 10007 Mask1[1] = PermMask[1]; 10008 Mask1[2] = HiIndex & 1 ? 6 : 4; 10009 Mask1[3] = HiIndex & 1 ? 4 : 6; 10010 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 10011 } 10012 10013 Mask1[0] = HiIndex & 1 ? 2 : 0; 10014 Mask1[1] = HiIndex & 1 ? 0 : 2; 10015 Mask1[2] = PermMask[2]; 10016 Mask1[3] = PermMask[3]; 10017 if (Mask1[2] >= 0) 10018 Mask1[2] += 4; 10019 if (Mask1[3] >= 0) 10020 Mask1[3] += 4; 10021 return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]); 10022 } 10023 10024 // Break it into (shuffle shuffle_hi, shuffle_lo). 10025 int LoMask[] = { -1, -1, -1, -1 }; 10026 int HiMask[] = { -1, -1, -1, -1 }; 10027 10028 int *MaskPtr = LoMask; 10029 unsigned MaskIdx = 0; 10030 unsigned LoIdx = 0; 10031 unsigned HiIdx = 2; 10032 for (unsigned i = 0; i != 4; ++i) { 10033 if (i == 2) { 10034 MaskPtr = HiMask; 10035 MaskIdx = 1; 10036 LoIdx = 0; 10037 HiIdx = 2; 10038 } 10039 int Idx = PermMask[i]; 10040 if (Idx < 0) { 10041 Locs[i] = std::make_pair(-1, -1); 10042 } else if (Idx < 4) { 10043 Locs[i] = std::make_pair(MaskIdx, LoIdx); 10044 MaskPtr[LoIdx] = Idx; 10045 LoIdx++; 10046 } else { 10047 Locs[i] = std::make_pair(MaskIdx, HiIdx); 10048 MaskPtr[HiIdx] = Idx; 10049 HiIdx++; 10050 } 10051 } 10052 10053 SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]); 10054 SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]); 10055 int MaskOps[] = { -1, -1, -1, -1 }; 10056 for (unsigned i = 0; i != 4; ++i) 10057 if (Locs[i].first != -1) 10058 MaskOps[i] = Locs[i].first * 4 + Locs[i].second; 10059 return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]); 10060 } 10061 10062 static bool MayFoldVectorLoad(SDValue V) { 10063 while (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) 10064 V = V.getOperand(0); 10065 10066 if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR) 10067 V = V.getOperand(0); 10068 if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR && 10069 V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF) 10070 // BUILD_VECTOR (load), undef 10071 V = V.getOperand(0); 10072 10073 return MayFoldLoad(V); 10074 } 10075 10076 static 10077 SDValue getMOVDDup(SDValue &Op, SDLoc &dl, SDValue V1, SelectionDAG &DAG) { 10078 MVT VT = Op.getSimpleValueType(); 10079 10080 // Canonizalize to v2f64. 
10081 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1); 10082 return DAG.getNode(ISD::BITCAST, dl, VT, 10083 getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64, 10084 V1, DAG)); 10085 } 10086 10087 static 10088 SDValue getMOVLowToHigh(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, 10089 bool HasSSE2) { 10090 SDValue V1 = Op.getOperand(0); 10091 SDValue V2 = Op.getOperand(1); 10092 MVT VT = Op.getSimpleValueType(); 10093 10094 assert(VT != MVT::v2i64 && "unsupported shuffle type"); 10095 10096 if (HasSSE2 && VT == MVT::v2f64) 10097 return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG); 10098 10099 // v4f32 or v4i32: canonizalized to v4f32 (which is legal for SSE1) 10100 return DAG.getNode(ISD::BITCAST, dl, VT, 10101 getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32, 10102 DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1), 10103 DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG)); 10104 } 10105 10106 static 10107 SDValue getMOVHighToLow(SDValue &Op, SDLoc &dl, SelectionDAG &DAG) { 10108 SDValue V1 = Op.getOperand(0); 10109 SDValue V2 = Op.getOperand(1); 10110 MVT VT = Op.getSimpleValueType(); 10111 10112 assert((VT == MVT::v4i32 || VT == MVT::v4f32) && 10113 "unsupported shuffle type"); 10114 10115 if (V2.getOpcode() == ISD::UNDEF) 10116 V2 = V1; 10117 10118 // v4i32 or v4f32 10119 return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG); 10120 } 10121 10122 static 10123 SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) { 10124 SDValue V1 = Op.getOperand(0); 10125 SDValue V2 = Op.getOperand(1); 10126 MVT VT = Op.getSimpleValueType(); 10127 unsigned NumElems = VT.getVectorNumElements(); 10128 10129 // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second 10130 // operand of these instructions is only memory, so check if there's a 10131 // potencial load folding here, otherwise use SHUFPS or MOVSD to match the 10132 // same masks. 10133 bool CanFoldLoad = false; 10134 10135 // Trivial case, when V2 comes from a load. 10136 if (MayFoldVectorLoad(V2)) 10137 CanFoldLoad = true; 10138 10139 // When V1 is a load, it can be folded later into a store in isel, example: 10140 // (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1) 10141 // turns into: 10142 // (MOVLPSmr addr:$src1, VR128:$src2) 10143 // So, recognize this potential and also use MOVLPS or MOVLPD 10144 else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op)) 10145 CanFoldLoad = true; 10146 10147 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 10148 if (CanFoldLoad) { 10149 if (HasSSE2 && NumElems == 2) 10150 return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG); 10151 10152 if (NumElems == 4) 10153 // If we don't care about the second element, proceed to use movss. 10154 if (SVOp->getMaskElt(1) != -1) 10155 return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG); 10156 } 10157 10158 // movl and movlp will both match v2i64, but v2i64 is never matched by 10159 // movl earlier because we make it strict to avoid messing with the movlp load 10160 // folding logic (see the code above getMOVLP call). Match it here then, 10161 // this is horrible, but will stay like this until we move all shuffle 10162 // matching to x86 specific nodes. Note that for the 1st condition all 10163 // types are matched with movsd. 
10164 if (HasSSE2) { 10165 // FIXME: isMOVLMask should be checked and matched before getMOVLP, 10166 // as to remove this logic from here, as much as possible 10167 if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT)) 10168 return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); 10169 return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); 10170 } 10171 10172 assert(VT != MVT::v4i32 && "unsupported shuffle type"); 10173 10174 // Invert the operand order and use SHUFPS to match it. 10175 return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1, 10176 getShuffleSHUFImmediate(SVOp), DAG); 10177 } 10178 10179 static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index, 10180 SelectionDAG &DAG) { 10181 SDLoc dl(Load); 10182 MVT VT = Load->getSimpleValueType(0); 10183 MVT EVT = VT.getVectorElementType(); 10184 SDValue Addr = Load->getOperand(1); 10185 SDValue NewAddr = DAG.getNode( 10186 ISD::ADD, dl, Addr.getSimpleValueType(), Addr, 10187 DAG.getConstant(Index * EVT.getStoreSize(), Addr.getSimpleValueType())); 10188 10189 SDValue NewLoad = 10190 DAG.getLoad(EVT, dl, Load->getChain(), NewAddr, 10191 DAG.getMachineFunction().getMachineMemOperand( 10192 Load->getMemOperand(), 0, EVT.getStoreSize())); 10193 return NewLoad; 10194 } 10195 10196 // It is only safe to call this function if isINSERTPSMask is true for 10197 // this shufflevector mask. 10198 static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl, 10199 SelectionDAG &DAG) { 10200 // Generate an insertps instruction when inserting an f32 from memory onto a 10201 // v4f32 or when copying a member from one v4f32 to another. 10202 // We also use it for transferring i32 from one register to another, 10203 // since it simply copies the same bits. 10204 // If we're transferring an i32 from memory to a specific element in a 10205 // register, we output a generic DAG that will match the PINSRD 10206 // instruction. 10207 MVT VT = SVOp->getSimpleValueType(0); 10208 MVT EVT = VT.getVectorElementType(); 10209 SDValue V1 = SVOp->getOperand(0); 10210 SDValue V2 = SVOp->getOperand(1); 10211 auto Mask = SVOp->getMask(); 10212 assert((VT == MVT::v4f32 || VT == MVT::v4i32) && 10213 "unsupported vector type for insertps/pinsrd"); 10214 10215 auto FromV1Predicate = [](const int &i) { return i < 4 && i > -1; }; 10216 auto FromV2Predicate = [](const int &i) { return i >= 4; }; 10217 int FromV1 = std::count_if(Mask.begin(), Mask.end(), FromV1Predicate); 10218 10219 SDValue From; 10220 SDValue To; 10221 unsigned DestIndex; 10222 if (FromV1 == 1) { 10223 From = V1; 10224 To = V2; 10225 DestIndex = std::find_if(Mask.begin(), Mask.end(), FromV1Predicate) - 10226 Mask.begin(); 10227 10228 // If we have 1 element from each vector, we have to check if we're 10229 // changing V1's element's place. If so, we're done. Otherwise, we 10230 // should assume we're changing V2's element's place and behave 10231 // accordingly. 10232 int FromV2 = std::count_if(Mask.begin(), Mask.end(), FromV2Predicate); 10233 assert(DestIndex <= INT32_MAX && "truncated destination index"); 10234 if (FromV1 == FromV2 && 10235 static_cast<int>(DestIndex) == Mask[DestIndex] % 4) { 10236 From = V2; 10237 To = V1; 10238 DestIndex = 10239 std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin(); 10240 } 10241 } else { 10242 assert(std::count_if(Mask.begin(), Mask.end(), FromV2Predicate) == 1 && 10243 "More than one element from V1 and from V2, or no elements from one " 10244 "of the vectors. 
This case should not have returned true from " 10245 "isINSERTPSMask"); 10246 From = V2; 10247 To = V1; 10248 DestIndex = 10249 std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin(); 10250 } 10251 10252 // Get an index into the source vector in the range [0,4) (the mask is 10253 // in the range [0,8) because it can address V1 and V2) 10254 unsigned SrcIndex = Mask[DestIndex] % 4; 10255 if (MayFoldLoad(From)) { 10256 // Trivial case, when From comes from a load and is only used by the 10257 // shuffle. Make it use insertps from the vector that we need from that 10258 // load. 10259 SDValue NewLoad = 10260 NarrowVectorLoadToElement(cast<LoadSDNode>(From), SrcIndex, DAG); 10261 if (!NewLoad.getNode()) 10262 return SDValue(); 10263 10264 if (EVT == MVT::f32) { 10265 // Create this as a scalar to vector to match the instruction pattern. 10266 SDValue LoadScalarToVector = 10267 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, NewLoad); 10268 SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4); 10269 return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, LoadScalarToVector, 10270 InsertpsMask); 10271 } else { // EVT == MVT::i32 10272 // If we're getting an i32 from memory, use an INSERT_VECTOR_ELT 10273 // instruction, to match the PINSRD instruction, which loads an i32 to a 10274 // certain vector element. 10275 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, To, NewLoad, 10276 DAG.getConstant(DestIndex, MVT::i32)); 10277 } 10278 } 10279 10280 // Vector-element-to-vector 10281 SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4 | SrcIndex << 6); 10282 return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask); 10283 } 10284 10285 // Reduce a vector shuffle to zext. 10286 static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget, 10287 SelectionDAG &DAG) { 10288 // PMOVZX is only available from SSE41. 10289 if (!Subtarget->hasSSE41()) 10290 return SDValue(); 10291 10292 MVT VT = Op.getSimpleValueType(); 10293 10294 // Only AVX2 support 256-bit vector integer extending. 10295 if (!Subtarget->hasInt256() && VT.is256BitVector()) 10296 return SDValue(); 10297 10298 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 10299 SDLoc DL(Op); 10300 SDValue V1 = Op.getOperand(0); 10301 SDValue V2 = Op.getOperand(1); 10302 unsigned NumElems = VT.getVectorNumElements(); 10303 10304 // Extending is an unary operation and the element type of the source vector 10305 // won't be equal to or larger than i64. 10306 if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() || 10307 VT.getVectorElementType() == MVT::i64) 10308 return SDValue(); 10309 10310 // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4. 10311 unsigned Shift = 1; // Start from 2, i.e. 1 << 1. 10312 while ((1U << Shift) < NumElems) { 10313 if (SVOp->getMaskElt(1U << Shift) == 1) 10314 break; 10315 Shift += 1; 10316 // The maximal ratio is 8, i.e. from i8 to i64. 10317 if (Shift > 3) 10318 return SDValue(); 10319 } 10320 10321 // Check the shuffle mask. 
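// Concretely, with NumElems == 8 and Shift == 1 (an i16 -> i32 extension,
// ratio 2), the only masks the loop below accepts have the shape
//   <0, -1, 1, -1, 2, -1, 3, -1>
// i.e. every position that is a multiple of the ratio must read element
// i >> Shift of V1 and every other position must be undef; those undef lanes
// are the ones that become the zero bits of the VZEXT result.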
10322 unsigned Mask = (1U << Shift) - 1; 10323 for (unsigned i = 0; i != NumElems; ++i) { 10324 int EltIdx = SVOp->getMaskElt(i); 10325 if ((i & Mask) != 0 && EltIdx != -1) 10326 return SDValue(); 10327 if ((i & Mask) == 0 && (unsigned)EltIdx != (i >> Shift)) 10328 return SDValue(); 10329 } 10330 10331 unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift; 10332 MVT NeVT = MVT::getIntegerVT(NBits); 10333 MVT NVT = MVT::getVectorVT(NeVT, NumElems >> Shift); 10334 10335 if (!DAG.getTargetLoweringInfo().isTypeLegal(NVT)) 10336 return SDValue(); 10337 10338 // Simplify the operand as it's prepared to be fed into shuffle. 10339 unsigned SignificantBits = NVT.getSizeInBits() >> Shift; 10340 if (V1.getOpcode() == ISD::BITCAST && 10341 V1.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR && 10342 V1.getOperand(0).getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && 10343 V1.getOperand(0).getOperand(0) 10344 .getSimpleValueType().getSizeInBits() == SignificantBits) { 10345 // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x) 10346 SDValue V = V1.getOperand(0).getOperand(0).getOperand(0); 10347 ConstantSDNode *CIdx = 10348 dyn_cast<ConstantSDNode>(V1.getOperand(0).getOperand(0).getOperand(1)); 10349 // If it's foldable, i.e. normal load with single use, we will let code 10350 // selection to fold it. Otherwise, we will short the conversion sequence. 10351 if (CIdx && CIdx->getZExtValue() == 0 && 10352 (!ISD::isNormalLoad(V.getNode()) || !V.hasOneUse())) { 10353 MVT FullVT = V.getSimpleValueType(); 10354 MVT V1VT = V1.getSimpleValueType(); 10355 if (FullVT.getSizeInBits() > V1VT.getSizeInBits()) { 10356 // The "ext_vec_elt" node is wider than the result node. 10357 // In this case we should extract subvector from V. 10358 // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast (extract_subvector x)). 10359 unsigned Ratio = FullVT.getSizeInBits() / V1VT.getSizeInBits(); 10360 MVT SubVecVT = MVT::getVectorVT(FullVT.getVectorElementType(), 10361 FullVT.getVectorNumElements()/Ratio); 10362 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, V, 10363 DAG.getIntPtrConstant(0)); 10364 } 10365 V1 = DAG.getNode(ISD::BITCAST, DL, V1VT, V); 10366 } 10367 } 10368 10369 return DAG.getNode(ISD::BITCAST, DL, VT, 10370 DAG.getNode(X86ISD::VZEXT, DL, NVT, V1)); 10371 } 10372 10373 static SDValue NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, 10374 SelectionDAG &DAG) { 10375 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 10376 MVT VT = Op.getSimpleValueType(); 10377 SDLoc dl(Op); 10378 SDValue V1 = Op.getOperand(0); 10379 SDValue V2 = Op.getOperand(1); 10380 10381 if (isZeroShuffle(SVOp)) 10382 return getZeroVector(VT, Subtarget, DAG, dl); 10383 10384 // Handle splat operations 10385 if (SVOp->isSplat()) { 10386 // Use vbroadcast whenever the splat comes from a foldable load 10387 SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG); 10388 if (Broadcast.getNode()) 10389 return Broadcast; 10390 } 10391 10392 // Check integer expanding shuffles. 10393 SDValue NewOp = LowerVectorIntExtend(Op, Subtarget, DAG); 10394 if (NewOp.getNode()) 10395 return NewOp; 10396 10397 // If the shuffle can be profitably rewritten as a narrower shuffle, then 10398 // do it! 
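// "Narrower" here means fewer but wider elements.  As a sketch of the intent
// (the details live in RewriteAsNarrowerShuffle): the v8i16 mask
//   <2,3, 0,1, 6,7, 4,5>
// moves its i16 elements in adjacent pairs, so it can be rewritten as the
// v4i32 shuffle <1,0,3,2> and lowered with a single PSHUFD instead of a more
// expensive PSHUFLW/PSHUFHW or PSHUFB sequence.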
10399 if (VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v16i16 || 10400 VT == MVT::v32i8) { 10401 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG); 10402 if (NewOp.getNode()) 10403 return DAG.getNode(ISD::BITCAST, dl, VT, NewOp); 10404 } else if (VT.is128BitVector() && Subtarget->hasSSE2()) { 10405 // FIXME: Figure out a cleaner way to do this. 10406 if (ISD::isBuildVectorAllZeros(V2.getNode())) { 10407 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG); 10408 if (NewOp.getNode()) { 10409 MVT NewVT = NewOp.getSimpleValueType(); 10410 if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), 10411 NewVT, true, false)) 10412 return getVZextMovL(VT, NewVT, NewOp.getOperand(0), DAG, Subtarget, 10413 dl); 10414 } 10415 } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { 10416 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG); 10417 if (NewOp.getNode()) { 10418 MVT NewVT = NewOp.getSimpleValueType(); 10419 if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT)) 10420 return getVZextMovL(VT, NewVT, NewOp.getOperand(1), DAG, Subtarget, 10421 dl); 10422 } 10423 } 10424 } 10425 return SDValue(); 10426 } 10427 10428 SDValue 10429 X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { 10430 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 10431 SDValue V1 = Op.getOperand(0); 10432 SDValue V2 = Op.getOperand(1); 10433 MVT VT = Op.getSimpleValueType(); 10434 SDLoc dl(Op); 10435 unsigned NumElems = VT.getVectorNumElements(); 10436 bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; 10437 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; 10438 bool V1IsSplat = false; 10439 bool V2IsSplat = false; 10440 bool HasSSE2 = Subtarget->hasSSE2(); 10441 bool HasFp256 = Subtarget->hasFp256(); 10442 bool HasInt256 = Subtarget->hasInt256(); 10443 MachineFunction &MF = DAG.getMachineFunction(); 10444 bool OptForSize = MF.getFunction()->getAttributes(). 10445 hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize); 10446 10447 // Check if we should use the experimental vector shuffle lowering. If so, 10448 // delegate completely to that code path. 10449 if (ExperimentalVectorShuffleLowering) 10450 return lowerVectorShuffle(Op, Subtarget, DAG); 10451 10452 assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles"); 10453 10454 if (V1IsUndef && V2IsUndef) 10455 return DAG.getUNDEF(VT); 10456 10457 // When we create a shuffle node we put the UNDEF node to second operand, 10458 // but in some cases the first operand may be transformed to UNDEF. 10459 // In this case we should just commute the node. 10460 if (V1IsUndef) 10461 return DAG.getCommutedVectorShuffle(*SVOp); 10462 10463 // Vector shuffle lowering takes 3 steps: 10464 // 10465 // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable 10466 // narrowing and commutation of operands should be handled. 10467 // 2) Matching of shuffles with known shuffle masks to x86 target specific 10468 // shuffle nodes. 10469 // 3) Rewriting of unmatched masks into new generic shuffle operations, 10470 // so the shuffle can be broken into other shuffles and the legalizer can 10471 // try the lowering again. 10472 // 10473 // The general idea is that no vector_shuffle operation should be left to 10474 // be matched during isel, all of them must be converted to a target specific 10475 // node here. 10476 10477 // Normalize the input vectors. Here splats, zeroed vectors, profitable 10478 // narrowing and commutation of operands should be handled. 
The actual code 10479 // doesn't include all of those, work in progress... 10480 SDValue NewOp = NormalizeVectorShuffle(Op, Subtarget, DAG); 10481 if (NewOp.getNode()) 10482 return NewOp; 10483 10484 SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end()); 10485 10486 // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and 10487 // unpckh_undef). Only use pshufd if speed is more important than size. 10488 if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasInt256)) 10489 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); 10490 if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasInt256)) 10491 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); 10492 10493 if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() && 10494 V2IsUndef && MayFoldVectorLoad(V1)) 10495 return getMOVDDup(Op, dl, V1, DAG); 10496 10497 if (isMOVHLPS_v_undef_Mask(M, VT)) 10498 return getMOVHighToLow(Op, dl, DAG); 10499 10500 // Use to match splats 10501 if (HasSSE2 && isUNPCKHMask(M, VT, HasInt256) && V2IsUndef && 10502 (VT == MVT::v2f64 || VT == MVT::v2i64)) 10503 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); 10504 10505 if (isPSHUFDMask(M, VT)) { 10506 // The actual implementation will match the mask in the if above and then 10507 // during isel it can match several different instructions, not only pshufd 10508 // as its name says, sad but true, emulate the behavior for now... 10509 if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64))) 10510 return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG); 10511 10512 unsigned TargetMask = getShuffleSHUFImmediate(SVOp); 10513 10514 if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32)) 10515 return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG); 10516 10517 if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64)) 10518 return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, TargetMask, 10519 DAG); 10520 10521 return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1, 10522 TargetMask, DAG); 10523 } 10524 10525 if (isPALIGNRMask(M, VT, Subtarget)) 10526 return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2, 10527 getShufflePALIGNRImmediate(SVOp), 10528 DAG); 10529 10530 if (isVALIGNMask(M, VT, Subtarget)) 10531 return getTargetShuffleNode(X86ISD::VALIGN, dl, VT, V1, V2, 10532 getShuffleVALIGNImmediate(SVOp), 10533 DAG); 10534 10535 // Check if this can be converted into a logical shift. 10536 bool isLeft = false; 10537 unsigned ShAmt = 0; 10538 SDValue ShVal; 10539 bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); 10540 if (isShift && ShVal.hasOneUse()) { 10541 // If the shifted value has multiple uses, it may be cheaper to use 10542 // v_set0 + movlhps or movhlps, etc. 10543 MVT EltVT = VT.getVectorElementType(); 10544 ShAmt *= EltVT.getSizeInBits(); 10545 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 10546 } 10547 10548 if (isMOVLMask(M, VT)) { 10549 if (ISD::isBuildVectorAllZeros(V1.getNode())) 10550 return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); 10551 if (!isMOVLPMask(M, VT)) { 10552 if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64)) 10553 return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); 10554 10555 if (VT == MVT::v4i32 || VT == MVT::v4f32) 10556 return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); 10557 } 10558 } 10559 10560 // FIXME: fold these into legal mask. 
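// For reference, the v4f32 masks the checks below are looking for are
//   MOVLHPS  <0,1,4,5>   (low halves of V1 and V2)
//   MOVHLPS  <6,7,2,3>   (high halves, operands swapped)
//   MOVSHDUP <1,1,3,3>   (duplicate the odd elements, SSE3)
//   MOVSLDUP <0,0,2,2>   (duplicate the even elements, SSE3)
// (values taken from the instruction semantics rather than from the
// predicates themselves, so treat them as illustrative).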
10561 if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasInt256)) 10562 return getMOVLowToHigh(Op, dl, DAG, HasSSE2); 10563 10564 if (isMOVHLPSMask(M, VT)) 10565 return getMOVHighToLow(Op, dl, DAG); 10566 10567 if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget)) 10568 return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG); 10569 10570 if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget)) 10571 return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG); 10572 10573 if (isMOVLPMask(M, VT)) 10574 return getMOVLP(Op, dl, DAG, HasSSE2); 10575 10576 if (ShouldXformToMOVHLPS(M, VT) || 10577 ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT)) 10578 return DAG.getCommutedVectorShuffle(*SVOp); 10579 10580 if (isShift) { 10581 // No better options. Use a vshldq / vsrldq. 10582 MVT EltVT = VT.getVectorElementType(); 10583 ShAmt *= EltVT.getSizeInBits(); 10584 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 10585 } 10586 10587 bool Commuted = false; 10588 // FIXME: This should also accept a bitcast of a splat? Be careful, not 10589 // 1,1,1,1 -> v8i16 though. 10590 BitVector UndefElements; 10591 if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V1.getNode())) 10592 if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none()) 10593 V1IsSplat = true; 10594 if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V2.getNode())) 10595 if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none()) 10596 V2IsSplat = true; 10597 10598 // Canonicalize the splat or undef, if present, to be on the RHS. 10599 if (!V2IsUndef && V1IsSplat && !V2IsSplat) { 10600 CommuteVectorShuffleMask(M, NumElems); 10601 std::swap(V1, V2); 10602 std::swap(V1IsSplat, V2IsSplat); 10603 Commuted = true; 10604 } 10605 10606 if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) { 10607 // Shuffling low element of v1 into undef, just return v1. 10608 if (V2IsUndef) 10609 return V1; 10610 // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which 10611 // the instruction selector will not match, so get a canonical MOVL with 10612 // swapped operands to undo the commute. 10613 return getMOVL(DAG, dl, VT, V2, V1); 10614 } 10615 10616 if (isUNPCKLMask(M, VT, HasInt256)) 10617 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG); 10618 10619 if (isUNPCKHMask(M, VT, HasInt256)) 10620 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG); 10621 10622 if (V2IsSplat) { 10623 // Normalize mask so all entries that point to V2 points to its first 10624 // element then try to match unpck{h|l} again. If match, return a 10625 // new vector_shuffle with the corrected mask.p 10626 SmallVector<int, 8> NewMask(M.begin(), M.end()); 10627 NormalizeMask(NewMask, NumElems); 10628 if (isUNPCKLMask(NewMask, VT, HasInt256, true)) 10629 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG); 10630 if (isUNPCKHMask(NewMask, VT, HasInt256, true)) 10631 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG); 10632 } 10633 10634 if (Commuted) { 10635 // Commute is back and try unpck* again. 10636 // FIXME: this seems wrong. 
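// For v4i32/v4f32 the interleaving masks being retried are
//   UNPCKL(V1,V2) = <0,4,1,5>   and   UNPCKH(V1,V2) = <2,6,3,7>;
// commuting the operands back means a shuffle that is really an unpck with
// the inputs in the opposite order (e.g. mask <4,0,5,1> before the commute)
// can still be caught here.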
10637 CommuteVectorShuffleMask(M, NumElems); 10638 std::swap(V1, V2); 10639 std::swap(V1IsSplat, V2IsSplat); 10640 10641 if (isUNPCKLMask(M, VT, HasInt256)) 10642 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG); 10643 10644 if (isUNPCKHMask(M, VT, HasInt256)) 10645 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG); 10646 } 10647 10648 // Normalize the node to match x86 shuffle ops if needed 10649 if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true))) 10650 return DAG.getCommutedVectorShuffle(*SVOp); 10651 10652 // The checks below are all present in isShuffleMaskLegal, but they are 10653 // inlined here right now to enable us to directly emit target specific 10654 // nodes, and remove one by one until they don't return Op anymore. 10655 10656 if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) && 10657 SVOp->getSplatIndex() == 0 && V2IsUndef) { 10658 if (VT == MVT::v2f64 || VT == MVT::v2i64) 10659 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); 10660 } 10661 10662 if (isPSHUFHWMask(M, VT, HasInt256)) 10663 return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1, 10664 getShufflePSHUFHWImmediate(SVOp), 10665 DAG); 10666 10667 if (isPSHUFLWMask(M, VT, HasInt256)) 10668 return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1, 10669 getShufflePSHUFLWImmediate(SVOp), 10670 DAG); 10671 10672 unsigned MaskValue; 10673 if (isBlendMask(M, VT, Subtarget->hasSSE41(), Subtarget->hasInt256(), 10674 &MaskValue)) 10675 return LowerVECTOR_SHUFFLEtoBlend(SVOp, MaskValue, Subtarget, DAG); 10676 10677 if (isSHUFPMask(M, VT)) 10678 return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2, 10679 getShuffleSHUFImmediate(SVOp), DAG); 10680 10681 if (isUNPCKL_v_undef_Mask(M, VT, HasInt256)) 10682 return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); 10683 if (isUNPCKH_v_undef_Mask(M, VT, HasInt256)) 10684 return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); 10685 10686 //===--------------------------------------------------------------------===// 10687 // Generate target specific nodes for 128 or 256-bit shuffles only 10688 // supported in the AVX instruction set. 
10689 // 10690 10691 // Handle VMOVDDUPY permutations 10692 if (V2IsUndef && isMOVDDUPYMask(M, VT, HasFp256)) 10693 return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG); 10694 10695 // Handle VPERMILPS/D* permutations 10696 if (isVPERMILPMask(M, VT)) { 10697 if ((HasInt256 && VT == MVT::v8i32) || VT == MVT::v16i32) 10698 return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, 10699 getShuffleSHUFImmediate(SVOp), DAG); 10700 return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, 10701 getShuffleSHUFImmediate(SVOp), DAG); 10702 } 10703 10704 unsigned Idx; 10705 if (VT.is512BitVector() && isINSERT64x4Mask(M, VT, &Idx)) 10706 return Insert256BitVector(V1, Extract256BitVector(V2, 0, DAG, dl), 10707 Idx*(NumElems/2), DAG, dl); 10708 10709 // Handle VPERM2F128/VPERM2I128 permutations 10710 if (isVPERM2X128Mask(M, VT, HasFp256)) 10711 return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1, 10712 V2, getShuffleVPERM2X128Immediate(SVOp), DAG); 10713 10714 if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT)) 10715 return getINSERTPS(SVOp, dl, DAG); 10716 10717 unsigned Imm8; 10718 if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8)) 10719 return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG); 10720 10721 if ((V2IsUndef && HasInt256 && VT.is256BitVector() && NumElems == 8) || 10722 VT.is512BitVector()) { 10723 MVT MaskEltVT = MVT::getIntegerVT(VT.getVectorElementType().getSizeInBits()); 10724 MVT MaskVectorVT = MVT::getVectorVT(MaskEltVT, NumElems); 10725 SmallVector<SDValue, 16> permclMask; 10726 for (unsigned i = 0; i != NumElems; ++i) { 10727 permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MaskEltVT)); 10728 } 10729 10730 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT, permclMask); 10731 if (V2IsUndef) 10732 // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32 10733 return DAG.getNode(X86ISD::VPERMV, dl, VT, 10734 DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1); 10735 return DAG.getNode(X86ISD::VPERMV3, dl, VT, V1, 10736 DAG.getNode(ISD::BITCAST, dl, VT, Mask), V2); 10737 } 10738 10739 //===--------------------------------------------------------------------===// 10740 // Since no target specific shuffle was selected for this generic one, 10741 // lower it into other known shuffles. FIXME: this isn't true yet, but 10742 // this is the plan. 10743 // 10744 10745 // Handle v8i16 specifically since SSE can do byte extraction and insertion. 10746 if (VT == MVT::v8i16) { 10747 SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, Subtarget, DAG); 10748 if (NewOp.getNode()) 10749 return NewOp; 10750 } 10751 10752 if (VT == MVT::v16i16 && Subtarget->hasInt256()) { 10753 SDValue NewOp = LowerVECTOR_SHUFFLEv16i16(Op, DAG); 10754 if (NewOp.getNode()) 10755 return NewOp; 10756 } 10757 10758 if (VT == MVT::v16i8) { 10759 SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, Subtarget, DAG); 10760 if (NewOp.getNode()) 10761 return NewOp; 10762 } 10763 10764 if (VT == MVT::v32i8) { 10765 SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, Subtarget, DAG); 10766 if (NewOp.getNode()) 10767 return NewOp; 10768 } 10769 10770 // Handle all 128-bit wide vectors with 4 elements, and match them with 10771 // several different shuffle types. 
10772 if (NumElems == 4 && VT.is128BitVector()) 10773 return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG); 10774 10775 // Handle general 256-bit shuffles 10776 if (VT.is256BitVector()) 10777 return LowerVECTOR_SHUFFLE_256(SVOp, DAG); 10778 10779 return SDValue(); 10780 } 10781 10782 // This function assumes its argument is a BUILD_VECTOR of constants or 10783 // undef SDNodes. i.e: ISD::isBuildVectorOfConstantSDNodes(BuildVector) is 10784 // true. 10785 static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector, 10786 unsigned &MaskValue) { 10787 MaskValue = 0; 10788 unsigned NumElems = BuildVector->getNumOperands(); 10789 // There are 2 lanes if (NumElems > 8), and 1 lane otherwise. 10790 unsigned NumLanes = (NumElems - 1) / 8 + 1; 10791 unsigned NumElemsInLane = NumElems / NumLanes; 10792 10793 // Blend for v16i16 should be symetric for the both lanes. 10794 for (unsigned i = 0; i < NumElemsInLane; ++i) { 10795 SDValue EltCond = BuildVector->getOperand(i); 10796 SDValue SndLaneEltCond = 10797 (NumLanes == 2) ? BuildVector->getOperand(i + NumElemsInLane) : EltCond; 10798 10799 int Lane1Cond = -1, Lane2Cond = -1; 10800 if (isa<ConstantSDNode>(EltCond)) 10801 Lane1Cond = !isZero(EltCond); 10802 if (isa<ConstantSDNode>(SndLaneEltCond)) 10803 Lane2Cond = !isZero(SndLaneEltCond); 10804 10805 if (Lane1Cond == Lane2Cond || Lane2Cond < 0) 10806 // Lane1Cond != 0, means we want the first argument. 10807 // Lane1Cond == 0, means we want the second argument. 10808 // The encoding of this argument is 0 for the first argument, 1 10809 // for the second. Therefore, invert the condition. 10810 MaskValue |= !Lane1Cond << i; 10811 else if (Lane1Cond < 0) 10812 MaskValue |= !Lane2Cond << i; 10813 else 10814 return false; 10815 } 10816 return true; 10817 } 10818 10819 // Try to lower a vselect node into a simple blend instruction. 10820 static SDValue LowerVSELECTtoBlend(SDValue Op, const X86Subtarget *Subtarget, 10821 SelectionDAG &DAG) { 10822 SDValue Cond = Op.getOperand(0); 10823 SDValue LHS = Op.getOperand(1); 10824 SDValue RHS = Op.getOperand(2); 10825 SDLoc dl(Op); 10826 MVT VT = Op.getSimpleValueType(); 10827 MVT EltVT = VT.getVectorElementType(); 10828 unsigned NumElems = VT.getVectorNumElements(); 10829 10830 // There is no blend with immediate in AVX-512. 10831 if (VT.is512BitVector()) 10832 return SDValue(); 10833 10834 if (!Subtarget->hasSSE41() || EltVT == MVT::i8) 10835 return SDValue(); 10836 if (!Subtarget->hasInt256() && VT == MVT::v16i16) 10837 return SDValue(); 10838 10839 if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) 10840 return SDValue(); 10841 10842 // Check the mask for BLEND and build the value. 10843 unsigned MaskValue = 0; 10844 if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue)) 10845 return SDValue(); 10846 10847 // Convert i32 vectors to floating point if it is not AVX2. 10848 // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors. 
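// A hand-worked example of the whole path (a sketch only): for
// (vselect (v4i32 <-1,0,-1,0>), LHS, RHS), BUILD_VECTORtoBlendMask returns
// MaskValue == 0b1010 -- a set bit selects the second blend operand, so
// elements 0 and 2 come from LHS and elements 1 and 3 from RHS.  Without
// AVX2 there is no VPBLENDD, so below the node is built with the
// floating-point type BlendVT (v4f32 here) and ends up as a BLENDPS with
// that same immediate.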
10849 MVT BlendVT = VT; 10850 if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) { 10851 BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()), 10852 NumElems); 10853 LHS = DAG.getNode(ISD::BITCAST, dl, VT, LHS); 10854 RHS = DAG.getNode(ISD::BITCAST, dl, VT, RHS); 10855 } 10856 10857 SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, LHS, RHS, 10858 DAG.getConstant(MaskValue, MVT::i32)); 10859 return DAG.getNode(ISD::BITCAST, dl, VT, Ret); 10860 } 10861 10862 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { 10863 // A vselect where all conditions and data are constants can be optimized into 10864 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR(). 10865 if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) && 10866 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) && 10867 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode())) 10868 return SDValue(); 10869 10870 SDValue BlendOp = LowerVSELECTtoBlend(Op, Subtarget, DAG); 10871 if (BlendOp.getNode()) 10872 return BlendOp; 10873 10874 // Some types for vselect were previously set to Expand, not Legal or 10875 // Custom. Return an empty SDValue so we fall-through to Expand, after 10876 // the Custom lowering phase. 10877 MVT VT = Op.getSimpleValueType(); 10878 switch (VT.SimpleTy) { 10879 default: 10880 break; 10881 case MVT::v8i16: 10882 case MVT::v16i16: 10883 if (Subtarget->hasBWI() && Subtarget->hasVLX()) 10884 break; 10885 return SDValue(); 10886 } 10887 10888 // We couldn't create a "Blend with immediate" node. 10889 // This node should still be legal, but we'll have to emit a blendv* 10890 // instruction. 10891 return Op; 10892 } 10893 10894 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { 10895 MVT VT = Op.getSimpleValueType(); 10896 SDLoc dl(Op); 10897 10898 if (!Op.getOperand(0).getSimpleValueType().is128BitVector()) 10899 return SDValue(); 10900 10901 if (VT.getSizeInBits() == 8) { 10902 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, 10903 Op.getOperand(0), Op.getOperand(1)); 10904 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 10905 DAG.getValueType(VT)); 10906 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 10907 } 10908 10909 if (VT.getSizeInBits() == 16) { 10910 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 10911 // If Idx is 0, it's cheaper to do a move instead of a pextrw. 10912 if (Idx == 0) 10913 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 10914 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 10915 DAG.getNode(ISD::BITCAST, dl, 10916 MVT::v4i32, 10917 Op.getOperand(0)), 10918 Op.getOperand(1))); 10919 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, 10920 Op.getOperand(0), Op.getOperand(1)); 10921 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 10922 DAG.getValueType(VT)); 10923 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 10924 } 10925 10926 if (VT == MVT::f32) { 10927 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy 10928 // the result back to FR32 register. It's only worth matching if the 10929 // result has a single use which is a store or a bitcast to i32. And in 10930 // the case of a store, it's not worth it if the index is a constant 0, 10931 // because a MOVSSmr can be used instead, which is smaller and faster. 
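// In assembly terms (roughly): storing lane 1 of a v4f32 can be a single
//   extractps $1, %xmm0, (%rdi)
// whereas extracting lane 1 into a value with other uses would cost an
// extractps to a GPR plus a movd back, so that case is left to the
// SHUFPS-based fallback further down.  A lane-0 store is simply
//   movss %xmm0, (%rdi)
// which is why a constant index of 0 is also rejected here.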
10932 if (!Op.hasOneUse()) 10933 return SDValue(); 10934 SDNode *User = *Op.getNode()->use_begin(); 10935 if ((User->getOpcode() != ISD::STORE || 10936 (isa<ConstantSDNode>(Op.getOperand(1)) && 10937 cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) && 10938 (User->getOpcode() != ISD::BITCAST || 10939 User->getValueType(0) != MVT::i32)) 10940 return SDValue(); 10941 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 10942 DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, 10943 Op.getOperand(0)), 10944 Op.getOperand(1)); 10945 return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract); 10946 } 10947 10948 if (VT == MVT::i32 || VT == MVT::i64) { 10949 // ExtractPS/pextrq works with constant index. 10950 if (isa<ConstantSDNode>(Op.getOperand(1))) 10951 return Op; 10952 } 10953 return SDValue(); 10954 } 10955 10956 /// Extract one bit from mask vector, like v16i1 or v8i1. 10957 /// AVX-512 feature. 10958 SDValue 10959 X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const { 10960 SDValue Vec = Op.getOperand(0); 10961 SDLoc dl(Vec); 10962 MVT VecVT = Vec.getSimpleValueType(); 10963 SDValue Idx = Op.getOperand(1); 10964 MVT EltVT = Op.getSimpleValueType(); 10965 10966 assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector"); 10967 10968 // variable index can't be handled in mask registers, 10969 // extend vector to VR512 10970 if (!isa<ConstantSDNode>(Idx)) { 10971 MVT ExtVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32); 10972 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec); 10973 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, 10974 ExtVT.getVectorElementType(), Ext, Idx); 10975 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt); 10976 } 10977 10978 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); 10979 const TargetRegisterClass* rc = getRegClassFor(VecVT); 10980 unsigned MaxSift = rc->getSize()*8 - 1; 10981 Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec, 10982 DAG.getConstant(MaxSift - IdxVal, MVT::i8)); 10983 Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec, 10984 DAG.getConstant(MaxSift, MVT::i8)); 10985 return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec, 10986 DAG.getIntPtrConstant(0)); 10987 } 10988 10989 SDValue 10990 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 10991 SelectionDAG &DAG) const { 10992 SDLoc dl(Op); 10993 SDValue Vec = Op.getOperand(0); 10994 MVT VecVT = Vec.getSimpleValueType(); 10995 SDValue Idx = Op.getOperand(1); 10996 10997 if (Op.getSimpleValueType() == MVT::i1) 10998 return ExtractBitFromMaskVector(Op, DAG); 10999 11000 if (!isa<ConstantSDNode>(Idx)) { 11001 if (VecVT.is512BitVector() || 11002 (VecVT.is256BitVector() && Subtarget->hasInt256() && 11003 VecVT.getVectorElementType().getSizeInBits() == 32)) { 11004 11005 MVT MaskEltVT = 11006 MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits()); 11007 MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() / 11008 MaskEltVT.getSizeInBits()); 11009 11010 Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT); 11011 SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT, 11012 getZeroVector(MaskVT, Subtarget, DAG, dl), 11013 Idx, DAG.getConstant(0, getPointerTy())); 11014 SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec); 11015 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), 11016 Perm, DAG.getConstant(0, getPointerTy())); 11017 } 11018 return SDValue(); 11019 } 11020 11021 // If this is a 256-bit vector result, first extract the 128-bit vector and 11022 // then extract the 
element from the 128-bit vector. 11023 if (VecVT.is256BitVector() || VecVT.is512BitVector()) { 11024 11025 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); 11026 // Get the 128-bit vector. 11027 Vec = Extract128BitVector(Vec, IdxVal, DAG, dl); 11028 MVT EltVT = VecVT.getVectorElementType(); 11029 11030 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits(); 11031 11032 //if (IdxVal >= NumElems/2) 11033 // IdxVal -= NumElems/2; 11034 IdxVal -= (IdxVal/ElemsPerChunk)*ElemsPerChunk; 11035 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, 11036 DAG.getConstant(IdxVal, MVT::i32)); 11037 } 11038 11039 assert(VecVT.is128BitVector() && "Unexpected vector length"); 11040 11041 if (Subtarget->hasSSE41()) { 11042 SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); 11043 if (Res.getNode()) 11044 return Res; 11045 } 11046 11047 MVT VT = Op.getSimpleValueType(); 11048 // TODO: handle v16i8. 11049 if (VT.getSizeInBits() == 16) { 11050 SDValue Vec = Op.getOperand(0); 11051 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 11052 if (Idx == 0) 11053 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 11054 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 11055 DAG.getNode(ISD::BITCAST, dl, 11056 MVT::v4i32, Vec), 11057 Op.getOperand(1))); 11058 // Transform it so it match pextrw which produces a 32-bit result. 11059 MVT EltVT = MVT::i32; 11060 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, 11061 Op.getOperand(0), Op.getOperand(1)); 11062 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract, 11063 DAG.getValueType(VT)); 11064 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 11065 } 11066 11067 if (VT.getSizeInBits() == 32) { 11068 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 11069 if (Idx == 0) 11070 return Op; 11071 11072 // SHUFPS the element to the lowest double word, then movss. 11073 int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 }; 11074 MVT VVT = Op.getOperand(0).getSimpleValueType(); 11075 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 11076 DAG.getUNDEF(VVT), Mask); 11077 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 11078 DAG.getIntPtrConstant(0)); 11079 } 11080 11081 if (VT.getSizeInBits() == 64) { 11082 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b 11083 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught 11084 // to match extract_elt for f64. 11085 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 11086 if (Idx == 0) 11087 return Op; 11088 11089 // UNPCKHPD the element to the lowest double word, then movsd. 11090 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored 11091 // to a f64mem, the whole operation is folded into a single MOVHPDmr. 11092 int Mask[2] = { 1, -1 }; 11093 MVT VVT = Op.getOperand(0).getSimpleValueType(); 11094 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 11095 DAG.getUNDEF(VVT), Mask); 11096 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 11097 DAG.getIntPtrConstant(0)); 11098 } 11099 11100 return SDValue(); 11101 } 11102 11103 /// Insert one bit to mask vector, like v16i1 or v8i1. 11104 /// AVX-512 feature. 
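/// The constant-index path below stays inside the mask register: assuming a
/// 16-bit wide register class for v16i1 (so MaxSift == 15), inserting bit b
/// at index 3 is built as
///   t = (scalar_to_vector(b) << 15) >> (15 - 3)   // b isolated at bit 3
///   result = Vec | t
/// i.e. the bit is first moved to the top position to clear the rest of the
/// temporary, then shifted down to its destination and OR'ed into Vec.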
11105 SDValue 11106 X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const { 11107 SDLoc dl(Op); 11108 SDValue Vec = Op.getOperand(0); 11109 SDValue Elt = Op.getOperand(1); 11110 SDValue Idx = Op.getOperand(2); 11111 MVT VecVT = Vec.getSimpleValueType(); 11112 11113 if (!isa<ConstantSDNode>(Idx)) { 11114 // Non constant index. Extend source and destination, 11115 // insert element and then truncate the result. 11116 MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32); 11117 MVT ExtEltVT = (VecVT == MVT::v8i1 ? MVT::i64 : MVT::i32); 11118 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT, 11119 DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec), 11120 DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx); 11121 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp); 11122 } 11123 11124 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); 11125 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt); 11126 if (Vec.getOpcode() == ISD::UNDEF) 11127 return DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec, 11128 DAG.getConstant(IdxVal, MVT::i8)); 11129 const TargetRegisterClass* rc = getRegClassFor(VecVT); 11130 unsigned MaxSift = rc->getSize()*8 - 1; 11131 EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec, 11132 DAG.getConstant(MaxSift, MVT::i8)); 11133 EltInVec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, EltInVec, 11134 DAG.getConstant(MaxSift - IdxVal, MVT::i8)); 11135 return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec); 11136 } 11137 11138 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, 11139 SelectionDAG &DAG) const { 11140 MVT VT = Op.getSimpleValueType(); 11141 MVT EltVT = VT.getVectorElementType(); 11142 11143 if (EltVT == MVT::i1) 11144 return InsertBitToMaskVector(Op, DAG); 11145 11146 SDLoc dl(Op); 11147 SDValue N0 = Op.getOperand(0); 11148 SDValue N1 = Op.getOperand(1); 11149 SDValue N2 = Op.getOperand(2); 11150 if (!isa<ConstantSDNode>(N2)) 11151 return SDValue(); 11152 auto *N2C = cast<ConstantSDNode>(N2); 11153 unsigned IdxVal = N2C->getZExtValue(); 11154 11155 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert 11156 // into that, and then insert the subvector back into the result. 11157 if (VT.is256BitVector() || VT.is512BitVector()) { 11158 // Get the desired 128-bit vector half. 11159 SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl); 11160 11161 // Insert the element into the desired half. 11162 unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits(); 11163 unsigned IdxIn128 = IdxVal - (IdxVal / NumEltsIn128) * NumEltsIn128; 11164 11165 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1, 11166 DAG.getConstant(IdxIn128, MVT::i32)); 11167 11168 // Insert the changed part back to the 256-bit vector 11169 return Insert128BitVector(N0, V, IdxVal, DAG, dl); 11170 } 11171 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!"); 11172 11173 if (Subtarget->hasSSE41()) { 11174 if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) { 11175 unsigned Opc; 11176 if (VT == MVT::v8i16) { 11177 Opc = X86ISD::PINSRW; 11178 } else { 11179 assert(VT == MVT::v16i8); 11180 Opc = X86ISD::PINSRB; 11181 } 11182 11183 // Transform it so it match pinsr{b,w} which expects a GR32 as its second 11184 // argument. 
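// For example, (insert_vector_elt (v8i16 X), (i16 %val), 3) becomes
//   (X86ISD::PINSRW X, (any_extend %val to i32), 3)
// which matches "pinsrw $3, %eax, %xmm0"; the instruction only reads the low
// 16 bits of the GR32 source, so an any-extend is sufficient.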
11185 if (N1.getValueType() != MVT::i32) 11186 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 11187 if (N2.getValueType() != MVT::i32) 11188 N2 = DAG.getIntPtrConstant(IdxVal); 11189 return DAG.getNode(Opc, dl, VT, N0, N1, N2); 11190 } 11191 11192 if (EltVT == MVT::f32) { 11193 // Bits [7:6] of the constant are the source select. This will always be 11194 // zero here. The DAG Combiner may combine an extract_elt index into 11195 // these 11196 // bits. For example (insert (extract, 3), 2) could be matched by 11197 // putting 11198 // the '3' into bits [7:6] of X86ISD::INSERTPS. 11199 // Bits [5:4] of the constant are the destination select. This is the 11200 // value of the incoming immediate. 11201 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may 11202 // combine either bitwise AND or insert of float 0.0 to set these bits. 11203 N2 = DAG.getIntPtrConstant(IdxVal << 4); 11204 // Create this as a scalar to vector.. 11205 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); 11206 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); 11207 } 11208 11209 if (EltVT == MVT::i32 || EltVT == MVT::i64) { 11210 // PINSR* works with constant index. 11211 return Op; 11212 } 11213 } 11214 11215 if (EltVT == MVT::i8) 11216 return SDValue(); 11217 11218 if (EltVT.getSizeInBits() == 16) { 11219 // Transform it so it match pinsrw which expects a 16-bit value in a GR32 11220 // as its second argument. 11221 if (N1.getValueType() != MVT::i32) 11222 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 11223 if (N2.getValueType() != MVT::i32) 11224 N2 = DAG.getIntPtrConstant(IdxVal); 11225 return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2); 11226 } 11227 return SDValue(); 11228 } 11229 11230 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { 11231 SDLoc dl(Op); 11232 MVT OpVT = Op.getSimpleValueType(); 11233 11234 // If this is a 256-bit vector result, first insert into a 128-bit 11235 // vector and then insert into the 256-bit vector. 11236 if (!OpVT.is128BitVector()) { 11237 // Insert into a 128-bit vector. 11238 unsigned SizeFactor = OpVT.getSizeInBits()/128; 11239 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(), 11240 OpVT.getVectorNumElements() / SizeFactor); 11241 11242 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0)); 11243 11244 // Insert the 128-bit vector. 11245 return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl); 11246 } 11247 11248 if (OpVT == MVT::v1i64 && 11249 Op.getOperand(0).getValueType() == MVT::i64) 11250 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0)); 11251 11252 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); 11253 assert(OpVT.is128BitVector() && "Expected an SSE type!"); 11254 return DAG.getNode(ISD::BITCAST, dl, OpVT, 11255 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt)); 11256 } 11257 11258 // Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in 11259 // a simple subregister reference or explicit instructions to grab 11260 // upper bits of a vector. 
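// For example, (extract_subvector (v8f32 %src), 4) -- the upper half -- goes
// through Extract128BitVector and typically becomes a "vextractf128 $1",
// while an extract at index 0 is just a reference to the xmm subregister of
// the ymm register and usually needs no instruction at all.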
11261 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, 11262 SelectionDAG &DAG) { 11263 SDLoc dl(Op); 11264 SDValue In = Op.getOperand(0); 11265 SDValue Idx = Op.getOperand(1); 11266 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); 11267 MVT ResVT = Op.getSimpleValueType(); 11268 MVT InVT = In.getSimpleValueType(); 11269 11270 if (Subtarget->hasFp256()) { 11271 if (ResVT.is128BitVector() && 11272 (InVT.is256BitVector() || InVT.is512BitVector()) && 11273 isa<ConstantSDNode>(Idx)) { 11274 return Extract128BitVector(In, IdxVal, DAG, dl); 11275 } 11276 if (ResVT.is256BitVector() && InVT.is512BitVector() && 11277 isa<ConstantSDNode>(Idx)) { 11278 return Extract256BitVector(In, IdxVal, DAG, dl); 11279 } 11280 } 11281 return SDValue(); 11282 } 11283 11284 // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a 11285 // simple superregister reference or explicit instructions to insert 11286 // the upper bits of a vector. 11287 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, 11288 SelectionDAG &DAG) { 11289 if (Subtarget->hasFp256()) { 11290 SDLoc dl(Op.getNode()); 11291 SDValue Vec = Op.getNode()->getOperand(0); 11292 SDValue SubVec = Op.getNode()->getOperand(1); 11293 SDValue Idx = Op.getNode()->getOperand(2); 11294 11295 if ((Op.getNode()->getSimpleValueType(0).is256BitVector() || 11296 Op.getNode()->getSimpleValueType(0).is512BitVector()) && 11297 SubVec.getNode()->getSimpleValueType(0).is128BitVector() && 11298 isa<ConstantSDNode>(Idx)) { 11299 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); 11300 return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl); 11301 } 11302 11303 if (Op.getNode()->getSimpleValueType(0).is512BitVector() && 11304 SubVec.getNode()->getSimpleValueType(0).is256BitVector() && 11305 isa<ConstantSDNode>(Idx)) { 11306 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); 11307 return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl); 11308 } 11309 } 11310 return SDValue(); 11311 } 11312 11313 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as 11314 // their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is 11315 // one of the above mentioned nodes. It has to be wrapped because otherwise 11316 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only 11317 // be used to form addressing mode. These wrapped nodes will be selected 11318 // into MOV32ri. 11319 SDValue 11320 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { 11321 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 11322 11323 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 11324 // global base reg. 11325 unsigned char OpFlag = 0; 11326 unsigned WrapperKind = X86ISD::Wrapper; 11327 CodeModel::Model M = DAG.getTarget().getCodeModel(); 11328 11329 if (Subtarget->isPICStyleRIPRel() && 11330 (M == CodeModel::Small || M == CodeModel::Kernel)) 11331 WrapperKind = X86ISD::WrapperRIP; 11332 else if (Subtarget->isPICStyleGOT()) 11333 OpFlag = X86II::MO_GOTOFF; 11334 else if (Subtarget->isPICStyleStubPIC()) 11335 OpFlag = X86II::MO_PIC_BASE_OFFSET; 11336 11337 SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(), 11338 CP->getAlignment(), 11339 CP->getOffset(), OpFlag); 11340 SDLoc DL(CP); 11341 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 11342 // With PIC, the address is actually $g + Offset. 
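// For example, in 32-bit GOT-style PIC the constant-pool entry is addressed
// relative to the GOT base held in a register (the X86ISD::GlobalBaseReg
// node, conventionally %ebx), so the address is materialized roughly as
//   leal .LCPI0_0@GOTOFF(%ebx), %eax
// or folded directly into a memory operand such as
//   movss .LCPI0_0@GOTOFF(%ebx), %xmm0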
11343 if (OpFlag) { 11344 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 11345 DAG.getNode(X86ISD::GlobalBaseReg, 11346 SDLoc(), getPointerTy()), 11347 Result); 11348 } 11349 11350 return Result; 11351 } 11352 11353 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { 11354 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 11355 11356 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 11357 // global base reg. 11358 unsigned char OpFlag = 0; 11359 unsigned WrapperKind = X86ISD::Wrapper; 11360 CodeModel::Model M = DAG.getTarget().getCodeModel(); 11361 11362 if (Subtarget->isPICStyleRIPRel() && 11363 (M == CodeModel::Small || M == CodeModel::Kernel)) 11364 WrapperKind = X86ISD::WrapperRIP; 11365 else if (Subtarget->isPICStyleGOT()) 11366 OpFlag = X86II::MO_GOTOFF; 11367 else if (Subtarget->isPICStyleStubPIC()) 11368 OpFlag = X86II::MO_PIC_BASE_OFFSET; 11369 11370 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(), 11371 OpFlag); 11372 SDLoc DL(JT); 11373 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 11374 11375 // With PIC, the address is actually $g + Offset. 11376 if (OpFlag) 11377 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 11378 DAG.getNode(X86ISD::GlobalBaseReg, 11379 SDLoc(), getPointerTy()), 11380 Result); 11381 11382 return Result; 11383 } 11384 11385 SDValue 11386 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { 11387 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol(); 11388 11389 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 11390 // global base reg. 11391 unsigned char OpFlag = 0; 11392 unsigned WrapperKind = X86ISD::Wrapper; 11393 CodeModel::Model M = DAG.getTarget().getCodeModel(); 11394 11395 if (Subtarget->isPICStyleRIPRel() && 11396 (M == CodeModel::Small || M == CodeModel::Kernel)) { 11397 if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF()) 11398 OpFlag = X86II::MO_GOTPCREL; 11399 WrapperKind = X86ISD::WrapperRIP; 11400 } else if (Subtarget->isPICStyleGOT()) { 11401 OpFlag = X86II::MO_GOT; 11402 } else if (Subtarget->isPICStyleStubPIC()) { 11403 OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE; 11404 } else if (Subtarget->isPICStyleStubNoDynamic()) { 11405 OpFlag = X86II::MO_DARWIN_NONLAZY; 11406 } 11407 11408 SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag); 11409 11410 SDLoc DL(Op); 11411 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 11412 11413 // With PIC, the address is actually $g + Offset. 11414 if (DAG.getTarget().getRelocationModel() == Reloc::PIC_ && 11415 !Subtarget->is64Bit()) { 11416 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 11417 DAG.getNode(X86ISD::GlobalBaseReg, 11418 SDLoc(), getPointerTy()), 11419 Result); 11420 } 11421 11422 // For symbols that require a load from a stub to get the address, emit the 11423 // load. 11424 if (isGlobalStubReference(OpFlag)) 11425 Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result, 11426 MachinePointerInfo::getGOT(), false, false, false, 0); 11427 11428 return Result; 11429 } 11430 11431 SDValue 11432 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { 11433 // Create the TargetBlockAddressAddress node. 
11434 unsigned char OpFlags = 11435 Subtarget->ClassifyBlockAddressReference(); 11436 CodeModel::Model M = DAG.getTarget().getCodeModel(); 11437 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 11438 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset(); 11439 SDLoc dl(Op); 11440 SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset, 11441 OpFlags); 11442 11443 if (Subtarget->isPICStyleRIPRel() && 11444 (M == CodeModel::Small || M == CodeModel::Kernel)) 11445 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 11446 else 11447 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 11448 11449 // With PIC, the address is actually $g + Offset. 11450 if (isGlobalRelativeToPICBase(OpFlags)) { 11451 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 11452 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 11453 Result); 11454 } 11455 11456 return Result; 11457 } 11458 11459 SDValue 11460 X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl, 11461 int64_t Offset, SelectionDAG &DAG) const { 11462 // Create the TargetGlobalAddress node, folding in the constant 11463 // offset if it is legal. 11464 unsigned char OpFlags = 11465 Subtarget->ClassifyGlobalReference(GV, DAG.getTarget()); 11466 CodeModel::Model M = DAG.getTarget().getCodeModel(); 11467 SDValue Result; 11468 if (OpFlags == X86II::MO_NO_FLAG && 11469 X86::isOffsetSuitableForCodeModel(Offset, M)) { 11470 // A direct static reference to a global. 11471 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset); 11472 Offset = 0; 11473 } else { 11474 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags); 11475 } 11476 11477 if (Subtarget->isPICStyleRIPRel() && 11478 (M == CodeModel::Small || M == CodeModel::Kernel)) 11479 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 11480 else 11481 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 11482 11483 // With PIC, the address is actually $g + Offset. 11484 if (isGlobalRelativeToPICBase(OpFlags)) { 11485 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 11486 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 11487 Result); 11488 } 11489 11490 // For globals that require a load from a stub to get the address, emit the 11491 // load. 11492 if (isGlobalStubReference(OpFlags)) 11493 Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result, 11494 MachinePointerInfo::getGOT(), false, false, false, 0); 11495 11496 // If there was a non-zero offset that we didn't fold, create an explicit 11497 // addition for it. 
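// For example, a reference to (@g + 8) where @g itself has to be loaded from
// the GOT (isGlobalStubReference) cannot fold the +8 into the relocation, so
// it becomes roughly
//   (add (load (X86ISD::WrapperRIP g@GOTPCREL)), 8)
// whereas a direct small-code-model reference folds the 8 into the
// TargetGlobalAddress above and no separate add is needed.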
11498 if (Offset != 0) 11499 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result, 11500 DAG.getConstant(Offset, getPointerTy())); 11501 11502 return Result; 11503 } 11504 11505 SDValue 11506 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { 11507 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 11508 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); 11509 return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG); 11510 } 11511 11512 static SDValue 11513 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, 11514 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, 11515 unsigned char OperandFlags, bool LocalDynamic = false) { 11516 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 11517 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 11518 SDLoc dl(GA); 11519 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 11520 GA->getValueType(0), 11521 GA->getOffset(), 11522 OperandFlags); 11523 11524 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR 11525 : X86ISD::TLSADDR; 11526 11527 if (InFlag) { 11528 SDValue Ops[] = { Chain, TGA, *InFlag }; 11529 Chain = DAG.getNode(CallType, dl, NodeTys, Ops); 11530 } else { 11531 SDValue Ops[] = { Chain, TGA }; 11532 Chain = DAG.getNode(CallType, dl, NodeTys, Ops); 11533 } 11534 11535 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. 11536 MFI->setAdjustsStack(true); 11537 11538 SDValue Flag = Chain.getValue(1); 11539 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); 11540 } 11541 11542 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit 11543 static SDValue 11544 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, 11545 const EVT PtrVT) { 11546 SDValue InFlag; 11547 SDLoc dl(GA); // ? function entry point might be better 11548 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, 11549 DAG.getNode(X86ISD::GlobalBaseReg, 11550 SDLoc(), PtrVT), InFlag); 11551 InFlag = Chain.getValue(1); 11552 11553 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); 11554 } 11555 11556 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit 11557 static SDValue 11558 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, 11559 const EVT PtrVT) { 11560 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, 11561 X86::RAX, X86II::MO_TLSGD); 11562 } 11563 11564 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, 11565 SelectionDAG &DAG, 11566 const EVT PtrVT, 11567 bool is64Bit) { 11568 SDLoc dl(GA); 11569 11570 // Get the start address of the TLS block for this module. 11571 X86MachineFunctionInfo* MFI = DAG.getMachineFunction() 11572 .getInfo<X86MachineFunctionInfo>(); 11573 MFI->incNumLocalDynamicTLSAccesses(); 11574 11575 SDValue Base; 11576 if (is64Bit) { 11577 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX, 11578 X86II::MO_TLSLD, /*LocalDynamic=*/true); 11579 } else { 11580 SDValue InFlag; 11581 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, 11582 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag); 11583 InFlag = Chain.getValue(1); 11584 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, 11585 X86II::MO_TLSLDM, /*LocalDynamic=*/true); 11586 } 11587 11588 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations 11589 // of Base. 11590 11591 // Build x@dtpoff. 
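// --- Editor's sketch (illustrative, not part of the original source) ---
// Assuming 32-bit ELF, the local-dynamic sequence built here roughly
// selects to:
//   leal  x@TLSLDM(%ebx), %eax
//   call  ___tls_get_addr@PLT     ; the TLSBASEADDR call above, returns the
//                                 ; module's TLS block base
//   leal  x@DTPOFF(%eax), %eax    ; the x@dtpoff + Base ADD built below
// The 64-bit form is analogous, using leaq/%rdi and __tls_get_addr.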
11592 unsigned char OperandFlags = X86II::MO_DTPOFF; 11593 unsigned WrapperKind = X86ISD::Wrapper; 11594 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 11595 GA->getValueType(0), 11596 GA->getOffset(), OperandFlags); 11597 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); 11598 11599 // Add x@dtpoff with the base. 11600 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base); 11601 } 11602 11603 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model. 11604 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, 11605 const EVT PtrVT, TLSModel::Model model, 11606 bool is64Bit, bool isPIC) { 11607 SDLoc dl(GA); 11608 11609 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit). 11610 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(), 11611 is64Bit ? 257 : 256)); 11612 11613 SDValue ThreadPointer = 11614 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0), 11615 MachinePointerInfo(Ptr), false, false, false, 0); 11616 11617 unsigned char OperandFlags = 0; 11618 // Most TLS accesses are not RIP relative, even on x86-64. One exception is 11619 // initialexec. 11620 unsigned WrapperKind = X86ISD::Wrapper; 11621 if (model == TLSModel::LocalExec) { 11622 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF; 11623 } else if (model == TLSModel::InitialExec) { 11624 if (is64Bit) { 11625 OperandFlags = X86II::MO_GOTTPOFF; 11626 WrapperKind = X86ISD::WrapperRIP; 11627 } else { 11628 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF; 11629 } 11630 } else { 11631 llvm_unreachable("Unexpected model"); 11632 } 11633 11634 // emit "addl x@ntpoff,%eax" (local exec) 11635 // or "addl x@indntpoff,%eax" (initial exec) 11636 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic) 11637 SDValue TGA = 11638 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), 11639 GA->getOffset(), OperandFlags); 11640 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); 11641 11642 if (model == TLSModel::InitialExec) { 11643 if (isPIC && !is64Bit) { 11644 Offset = DAG.getNode(ISD::ADD, dl, PtrVT, 11645 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), 11646 Offset); 11647 } 11648 11649 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, 11650 MachinePointerInfo::getGOT(), false, false, false, 0); 11651 } 11652 11653 // The address of the thread local variable is the add of the thread 11654 // pointer with the offset of the variable. 
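// --- Editor's note (illustrative, not part of the original source) ---
// The 32-bit forms are shown earlier in this function (addl x@ntpoff,%eax
// etc.); assuming x86-64 ELF, the ADD below typically ends up as:
//   local-exec:    movq %fs:0, %rax
//                  leaq x@TPOFF(%rax), %rax
//   initial-exec:  movq %fs:0, %rax
//                  addq x@GOTTPOFF(%rip), %rax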
11655 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 11656 } 11657 11658 SDValue 11659 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { 11660 11661 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 11662 const GlobalValue *GV = GA->getGlobal(); 11663 11664 if (Subtarget->isTargetELF()) { 11665 TLSModel::Model model = DAG.getTarget().getTLSModel(GV); 11666 11667 switch (model) { 11668 case TLSModel::GeneralDynamic: 11669 if (Subtarget->is64Bit()) 11670 return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); 11671 return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); 11672 case TLSModel::LocalDynamic: 11673 return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(), 11674 Subtarget->is64Bit()); 11675 case TLSModel::InitialExec: 11676 case TLSModel::LocalExec: 11677 return LowerToTLSExecModel( 11678 GA, DAG, getPointerTy(), model, Subtarget->is64Bit(), 11679 DAG.getTarget().getRelocationModel() == Reloc::PIC_); 11680 } 11681 llvm_unreachable("Unknown TLS model."); 11682 } 11683 11684 if (Subtarget->isTargetDarwin()) { 11685 // Darwin only has one model of TLS. Lower to that. 11686 unsigned char OpFlag = 0; 11687 unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ? 11688 X86ISD::WrapperRIP : X86ISD::Wrapper; 11689 11690 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 11691 // global base reg. 11692 bool PIC32 = (DAG.getTarget().getRelocationModel() == Reloc::PIC_) && 11693 !Subtarget->is64Bit(); 11694 if (PIC32) 11695 OpFlag = X86II::MO_TLVP_PIC_BASE; 11696 else 11697 OpFlag = X86II::MO_TLVP; 11698 SDLoc DL(Op); 11699 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, 11700 GA->getValueType(0), 11701 GA->getOffset(), OpFlag); 11702 SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 11703 11704 // With PIC32, the address is actually $g + Offset. 11705 if (PIC32) 11706 Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(), 11707 DAG.getNode(X86ISD::GlobalBaseReg, 11708 SDLoc(), getPointerTy()), 11709 Offset); 11710 11711 // Lowering the machine isd will make sure everything is in the right 11712 // location. 11713 SDValue Chain = DAG.getEntryNode(); 11714 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 11715 SDValue Args[] = { Chain, Offset }; 11716 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args); 11717 11718 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls. 11719 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 11720 MFI->setAdjustsStack(true); 11721 11722 // And our return value (tls address) is in the standard call return value 11723 // location. 11724 unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; 11725 return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(), 11726 Chain.getValue(1)); 11727 } 11728 11729 if (Subtarget->isTargetKnownWindowsMSVC() || 11730 Subtarget->isTargetWindowsGNU()) { 11731 // Just use the implicit TLS architecture 11732 // Need to generate someting similar to: 11733 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage 11734 // ; from TEB 11735 // mov ecx, dword [rel _tls_index]: Load index (from C runtime) 11736 // mov rcx, qword [rdx+rcx*8] 11737 // mov eax, .tls$:tlsvar 11738 // [rax+rcx] contains the address 11739 // Windows 64bit: gs:0x58 11740 // Windows 32bit: fs:__tls_array 11741 11742 SDLoc dl(GA); 11743 SDValue Chain = DAG.getEntryNode(); 11744 11745 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or 11746 // %gs:0x58 (64-bit). 
On MinGW, __tls_array is not available, so directly 11747 // use its literal value of 0x2C. 11748 Value *Ptr = Constant::getNullValue(Subtarget->is64Bit() 11749 ? Type::getInt8PtrTy(*DAG.getContext(), 11750 256) 11751 : Type::getInt32PtrTy(*DAG.getContext(), 11752 257)); 11753 11754 SDValue TlsArray = 11755 Subtarget->is64Bit() 11756 ? DAG.getIntPtrConstant(0x58) 11757 : (Subtarget->isTargetWindowsGNU() 11758 ? DAG.getIntPtrConstant(0x2C) 11759 : DAG.getExternalSymbol("_tls_array", getPointerTy())); 11760 11761 SDValue ThreadPointer = 11762 DAG.getLoad(getPointerTy(), dl, Chain, TlsArray, 11763 MachinePointerInfo(Ptr), false, false, false, 0); 11764 11765 // Load the _tls_index variable 11766 SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy()); 11767 if (Subtarget->is64Bit()) 11768 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain, 11769 IDX, MachinePointerInfo(), MVT::i32, 11770 false, false, false, 0); 11771 else 11772 IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(), 11773 false, false, false, 0); 11774 11775 SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()), 11776 getPointerTy()); 11777 IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale); 11778 11779 SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX); 11780 res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(), 11781 false, false, false, 0); 11782 11783 // Get the offset of start of .tls section 11784 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 11785 GA->getValueType(0), 11786 GA->getOffset(), X86II::MO_SECREL); 11787 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA); 11788 11789 // The address of the thread local variable is the add of the thread 11790 // pointer with the offset of the variable. 11791 return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset); 11792 } 11793 11794 llvm_unreachable("TLS not implemented for this target."); 11795 } 11796 11797 /// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values 11798 /// and take a 2 x i32 value to shift plus a shift amount. 11799 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) { 11800 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 11801 MVT VT = Op.getSimpleValueType(); 11802 unsigned VTBits = VT.getSizeInBits(); 11803 SDLoc dl(Op); 11804 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; 11805 SDValue ShOpLo = Op.getOperand(0); 11806 SDValue ShOpHi = Op.getOperand(1); 11807 SDValue ShAmt = Op.getOperand(2); 11808 // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the 11809 // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away 11810 // during isel. 11811 SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, 11812 DAG.getConstant(VTBits - 1, MVT::i8)); 11813 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi, 11814 DAG.getConstant(VTBits - 1, MVT::i8)) 11815 : DAG.getConstant(0, VT); 11816 11817 SDValue Tmp2, Tmp3; 11818 if (Op.getOpcode() == ISD::SHL_PARTS) { 11819 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); 11820 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt); 11821 } else { 11822 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); 11823 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt); 11824 } 11825 11826 // If the shift amount is larger or equal than the width of a part we can't 11827 // rely on the results of shld/shrd. 
Insert a test and select the appropriate 11828 // values for large shift amounts. 11829 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, 11830 DAG.getConstant(VTBits, MVT::i8)); 11831 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 11832 AndNode, DAG.getConstant(0, MVT::i8)); 11833 11834 SDValue Hi, Lo; 11835 SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8); 11836 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; 11837 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; 11838 11839 if (Op.getOpcode() == ISD::SHL_PARTS) { 11840 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0); 11841 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1); 11842 } else { 11843 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0); 11844 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1); 11845 } 11846 11847 SDValue Ops[2] = { Lo, Hi }; 11848 return DAG.getMergeValues(Ops, dl); 11849 } 11850 11851 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, 11852 SelectionDAG &DAG) const { 11853 MVT SrcVT = Op.getOperand(0).getSimpleValueType(); 11854 11855 if (SrcVT.isVector()) 11856 return SDValue(); 11857 11858 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 && 11859 "Unknown SINT_TO_FP to lower!"); 11860 11861 // These are really Legal; return the operand so the caller accepts it as 11862 // Legal. 11863 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) 11864 return Op; 11865 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && 11866 Subtarget->is64Bit()) { 11867 return Op; 11868 } 11869 11870 SDLoc dl(Op); 11871 unsigned Size = SrcVT.getSizeInBits()/8; 11872 MachineFunction &MF = DAG.getMachineFunction(); 11873 int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); 11874 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 11875 SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 11876 StackSlot, 11877 MachinePointerInfo::getFixedStack(SSFI), 11878 false, false, 0); 11879 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); 11880 } 11881 11882 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, 11883 SDValue StackSlot, 11884 SelectionDAG &DAG) const { 11885 // Build the FILD 11886 SDLoc DL(Op); 11887 SDVTList Tys; 11888 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); 11889 if (useSSE) 11890 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue); 11891 else 11892 Tys = DAG.getVTList(Op.getValueType(), MVT::Other); 11893 11894 unsigned ByteSize = SrcVT.getSizeInBits()/8; 11895 11896 FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot); 11897 MachineMemOperand *MMO; 11898 if (FI) { 11899 int SSFI = FI->getIndex(); 11900 MMO = 11901 DAG.getMachineFunction() 11902 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 11903 MachineMemOperand::MOLoad, ByteSize, ByteSize); 11904 } else { 11905 MMO = cast<LoadSDNode>(StackSlot)->getMemOperand(); 11906 StackSlot = StackSlot.getOperand(1); 11907 } 11908 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; 11909 SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : 11910 X86ISD::FILD, DL, 11911 Tys, Ops, SrcVT, MMO); 11912 11913 if (useSSE) { 11914 Chain = Result.getValue(1); 11915 SDValue InFlag = Result.getValue(2); 11916 11917 // FIXME: Currently the FST is flagged to the FILD_FLAG. This 11918 // shouldn't be necessary except that RFP cannot be live across 11919 // multiple blocks. When stackifier is fixed, they can be uncoupled. 
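// --- Editor's sketch (illustrative, not part of the original source) ---
// For an SSE destination, the FILD/FST pair built in this block roughly
// selects to a spill-and-reload through the x87 stack, e.g. for i64 -> f64:
//   fildll  (%rsp)           ; X86ISD::FILD_FLAG, load the integer into ST(0)
//   fstpl   8(%rsp)          ; X86ISD::FST, store it back out as a double
//   movsd   8(%rsp), %xmm0   ; the final DAG.getLoad below
// (the stack offsets are placeholders for the frame-index slots created here).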
11920 MachineFunction &MF = DAG.getMachineFunction(); 11921 unsigned SSFISize = Op.getValueType().getSizeInBits()/8; 11922 int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false); 11923 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 11924 Tys = DAG.getVTList(MVT::Other); 11925 SDValue Ops[] = { 11926 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag 11927 }; 11928 MachineMemOperand *MMO = 11929 DAG.getMachineFunction() 11930 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 11931 MachineMemOperand::MOStore, SSFISize, SSFISize); 11932 11933 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, 11934 Ops, Op.getValueType(), MMO); 11935 Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot, 11936 MachinePointerInfo::getFixedStack(SSFI), 11937 false, false, false, 0); 11938 } 11939 11940 return Result; 11941 } 11942 11943 // LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. 11944 SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, 11945 SelectionDAG &DAG) const { 11946 // This algorithm is not obvious. Here it is what we're trying to output: 11947 /* 11948 movq %rax, %xmm0 11949 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U } 11950 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 } 11951 #ifdef __SSE3__ 11952 haddpd %xmm0, %xmm0 11953 #else 11954 pshufd $0x4e, %xmm0, %xmm1 11955 addpd %xmm1, %xmm0 11956 #endif 11957 */ 11958 11959 SDLoc dl(Op); 11960 LLVMContext *Context = DAG.getContext(); 11961 11962 // Build some magic constants. 11963 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 }; 11964 Constant *C0 = ConstantDataVector::get(*Context, CV0); 11965 SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); 11966 11967 SmallVector<Constant*,2> CV1; 11968 CV1.push_back( 11969 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble, 11970 APInt(64, 0x4330000000000000ULL)))); 11971 CV1.push_back( 11972 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble, 11973 APInt(64, 0x4530000000000000ULL)))); 11974 Constant *C1 = ConstantVector::get(CV1); 11975 SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); 11976 11977 // Load the 64-bit value into an XMM register. 11978 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, 11979 Op.getOperand(0)); 11980 SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, 11981 MachinePointerInfo::getConstantPool(), 11982 false, false, false, 16); 11983 SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, 11984 DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1), 11985 CLod0); 11986 11987 SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, 11988 MachinePointerInfo::getConstantPool(), 11989 false, false, false, 16); 11990 SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1); 11991 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); 11992 SDValue Result; 11993 11994 if (Subtarget->hasSSE3()) { 11995 // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'. 
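// --- Editor's sketch (illustrative, not part of the original source) ---
// In plain scalar terms, the constant trick above computes, for a u64 value x:
//   lo = BitsToDouble(0x4330000000000000ULL | (x & 0xffffffffULL)) - 0x1.0p52;
//   hi = BitsToDouble(0x4530000000000000ULL | (x >> 32))           - 0x1.0p84;
//   result = hi + lo;    // lo == low 32 bits, hi == high 32 bits * 2^32
// The punpckldq builds both biased doubles at once, subpd removes the biases,
// and the haddpd (or pshufd+addpd) below performs the final hi + lo.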
11996 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub); 11997 } else { 11998 SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub); 11999 SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32, 12000 S2F, 0x4E, DAG); 12001 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, 12002 DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle), 12003 Sub); 12004 } 12005 12006 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result, 12007 DAG.getIntPtrConstant(0)); 12008 } 12009 12010 // LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. 12011 SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, 12012 SelectionDAG &DAG) const { 12013 SDLoc dl(Op); 12014 // FP constant to bias correct the final result. 12015 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), 12016 MVT::f64); 12017 12018 // Load the 32-bit value into an XMM register. 12019 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 12020 Op.getOperand(0)); 12021 12022 // Zero out the upper parts of the register. 12023 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG); 12024 12025 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 12026 DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load), 12027 DAG.getIntPtrConstant(0)); 12028 12029 // Or the load with the bias. 12030 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, 12031 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, 12032 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 12033 MVT::v2f64, Load)), 12034 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, 12035 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 12036 MVT::v2f64, Bias))); 12037 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 12038 DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or), 12039 DAG.getIntPtrConstant(0)); 12040 12041 // Subtract the bias. 12042 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); 12043 12044 // Handle final rounding. 12045 EVT DestVT = Op.getValueType(); 12046 12047 if (DestVT.bitsLT(MVT::f64)) 12048 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, 12049 DAG.getIntPtrConstant(0)); 12050 if (DestVT.bitsGT(MVT::f64)) 12051 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub); 12052 12053 // Handle final rounding. 12054 return Sub; 12055 } 12056 12057 SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op, 12058 SelectionDAG &DAG) const { 12059 SDValue N0 = Op.getOperand(0); 12060 MVT SVT = N0.getSimpleValueType(); 12061 SDLoc dl(Op); 12062 12063 assert((SVT == MVT::v4i8 || SVT == MVT::v4i16 || 12064 SVT == MVT::v8i8 || SVT == MVT::v8i16) && 12065 "Custom UINT_TO_FP is not supported!"); 12066 12067 MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements()); 12068 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), 12069 DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0)); 12070 } 12071 12072 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, 12073 SelectionDAG &DAG) const { 12074 SDValue N0 = Op.getOperand(0); 12075 SDLoc dl(Op); 12076 12077 if (Op.getValueType().isVector()) 12078 return lowerUINT_TO_FP_vec(Op, DAG); 12079 12080 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't 12081 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform 12082 // the optimization here. 
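// --- Editor's note (illustrative, not part of the original source) ---
// LowerUINT_TO_FP_i32 above is the scalar "bias" trick: in plain C terms,
//   double d = BitsToDouble(0x4330000000000000ULL | (uint64_t)u) - 0x1.0p52;
// i.e. the 32-bit value is OR'ed into the low mantissa bits of 2^52 and the
// bias is subtracted, leaving the exact unsigned value as a double.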
12083 if (DAG.SignBitIsZero(N0)) 12084 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0); 12085 12086 MVT SrcVT = N0.getSimpleValueType(); 12087 MVT DstVT = Op.getSimpleValueType(); 12088 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64) 12089 return LowerUINT_TO_FP_i64(Op, DAG); 12090 if (SrcVT == MVT::i32 && X86ScalarSSEf64) 12091 return LowerUINT_TO_FP_i32(Op, DAG); 12092 if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32) 12093 return SDValue(); 12094 12095 // Make a 64-bit buffer, and use it to build an FILD. 12096 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); 12097 if (SrcVT == MVT::i32) { 12098 SDValue WordOff = DAG.getConstant(4, getPointerTy()); 12099 SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, 12100 getPointerTy(), StackSlot, WordOff); 12101 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 12102 StackSlot, MachinePointerInfo(), 12103 false, false, 0); 12104 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32), 12105 OffsetSlot, MachinePointerInfo(), 12106 false, false, 0); 12107 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); 12108 return Fild; 12109 } 12110 12111 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); 12112 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 12113 StackSlot, MachinePointerInfo(), 12114 false, false, 0); 12115 // For i64 source, we need to add the appropriate power of 2 if the input 12116 // was negative. This is the same as the optimization in 12117 // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here, 12118 // we must be careful to do the computation in x87 extended precision, not 12119 // in SSE. (The generic code can't know it's OK to do this, or how to.) 12120 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex(); 12121 MachineMemOperand *MMO = 12122 DAG.getMachineFunction() 12123 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 12124 MachineMemOperand::MOLoad, 8, 8); 12125 12126 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); 12127 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) }; 12128 SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, 12129 MVT::i64, MMO); 12130 12131 APInt FF(32, 0x5F800000ULL); 12132 12133 // Check whether the sign bit is set. 12134 SDValue SignSet = DAG.getSetCC(dl, 12135 getSetCCResultType(*DAG.getContext(), MVT::i64), 12136 Op.getOperand(0), DAG.getConstant(0, MVT::i64), 12137 ISD::SETLT); 12138 12139 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits. 12140 SDValue FudgePtr = DAG.getConstantPool( 12141 ConstantInt::get(*DAG.getContext(), FF.zext(64)), 12142 getPointerTy()); 12143 12144 // Get a pointer to FF if the sign bit was set, or to 0 otherwise. 12145 SDValue Zero = DAG.getIntPtrConstant(0); 12146 SDValue Four = DAG.getIntPtrConstant(4); 12147 SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet, 12148 Zero, Four); 12149 FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset); 12150 12151 // Load the value out, extending it from f32 to f80. 12152 // FIXME: Avoid the extend by constructing the right constant pool? 12153 SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), 12154 FudgePtr, MachinePointerInfo::getConstantPool(), 12155 MVT::f32, false, false, false, 4); 12156 // Extend everything to 80 bits to force it to be done on x87. 
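// --- Editor's note (not part of the original source) ---
// 0x5F800000 is the single-precision encoding of 2^64. FILD reads the stored
// i64 as signed, so when the sign bit was set the loaded value is x - 2^64;
// the SELECT above picks 2^64 from the constant pool in that case (and 0.0
// otherwise), and the FADD below adds it back so the result is the unsigned
// value before the final rounding to the destination type.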
12157 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); 12158 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0)); 12159 } 12160 12161 std::pair<SDValue,SDValue> 12162 X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, 12163 bool IsSigned, bool IsReplace) const { 12164 SDLoc DL(Op); 12165 12166 EVT DstTy = Op.getValueType(); 12167 12168 if (!IsSigned && !isIntegerTypeFTOL(DstTy)) { 12169 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); 12170 DstTy = MVT::i64; 12171 } 12172 12173 assert(DstTy.getSimpleVT() <= MVT::i64 && 12174 DstTy.getSimpleVT() >= MVT::i16 && 12175 "Unknown FP_TO_INT to lower!"); 12176 12177 // These are really Legal. 12178 if (DstTy == MVT::i32 && 12179 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 12180 return std::make_pair(SDValue(), SDValue()); 12181 if (Subtarget->is64Bit() && 12182 DstTy == MVT::i64 && 12183 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 12184 return std::make_pair(SDValue(), SDValue()); 12185 12186 // We lower FP->int64 either into FISTP64 followed by a load from a temporary 12187 // stack slot, or into the FTOL runtime function. 12188 MachineFunction &MF = DAG.getMachineFunction(); 12189 unsigned MemSize = DstTy.getSizeInBits()/8; 12190 int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 12191 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 12192 12193 unsigned Opc; 12194 if (!IsSigned && isIntegerTypeFTOL(DstTy)) 12195 Opc = X86ISD::WIN_FTOL; 12196 else 12197 switch (DstTy.getSimpleVT().SimpleTy) { 12198 default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); 12199 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; 12200 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; 12201 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; 12202 } 12203 12204 SDValue Chain = DAG.getEntryNode(); 12205 SDValue Value = Op.getOperand(0); 12206 EVT TheVT = Op.getOperand(0).getValueType(); 12207 // FIXME This causes a redundant load/store if the SSE-class value is already 12208 // in memory, such as if it is on the callstack. 
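// --- Editor's sketch (illustrative, not part of the original source) ---
// For an SSE value converted to i64, the block below roughly produces:
//   movsd   %xmm0, (%rsp)     ; spill the SSE value (the getStore below)
//   fldl    (%rsp)            ; X86ISD::FLD, reload onto the x87 stack
//   fistpll 8(%rsp)           ; FP_TO_INT64_IN_MEM (fisttp when SSE3 is
//                             ; available, otherwise an fistp sequence that
//                             ; temporarily switches the rounding mode)
// after which the caller loads the i64 result from the second stack slot.
// (register and offset choices are placeholders.)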
12209 if (isScalarFPTypeInSSEReg(TheVT)) { 12210 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); 12211 Chain = DAG.getStore(Chain, DL, Value, StackSlot, 12212 MachinePointerInfo::getFixedStack(SSFI), 12213 false, false, 0); 12214 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); 12215 SDValue Ops[] = { 12216 Chain, StackSlot, DAG.getValueType(TheVT) 12217 }; 12218 12219 MachineMemOperand *MMO = 12220 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 12221 MachineMemOperand::MOLoad, MemSize, MemSize); 12222 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO); 12223 Chain = Value.getValue(1); 12224 SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 12225 StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 12226 } 12227 12228 MachineMemOperand *MMO = 12229 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 12230 MachineMemOperand::MOStore, MemSize, MemSize); 12231 12232 if (Opc != X86ISD::WIN_FTOL) { 12233 // Build the FP_TO_INT*_IN_MEM 12234 SDValue Ops[] = { Chain, Value, StackSlot }; 12235 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), 12236 Ops, DstTy, MMO); 12237 return std::make_pair(FIST, StackSlot); 12238 } else { 12239 SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL, 12240 DAG.getVTList(MVT::Other, MVT::Glue), 12241 Chain, Value); 12242 SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX, 12243 MVT::i32, ftol.getValue(1)); 12244 SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX, 12245 MVT::i32, eax.getValue(2)); 12246 SDValue Ops[] = { eax, edx }; 12247 SDValue pair = IsReplace 12248 ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops) 12249 : DAG.getMergeValues(Ops, DL); 12250 return std::make_pair(pair, SDValue()); 12251 } 12252 } 12253 12254 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, 12255 const X86Subtarget *Subtarget) { 12256 MVT VT = Op->getSimpleValueType(0); 12257 SDValue In = Op->getOperand(0); 12258 MVT InVT = In.getSimpleValueType(); 12259 SDLoc dl(Op); 12260 12261 // Optimize vectors in AVX mode: 12262 // 12263 // v8i16 -> v8i32 12264 // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32. 12265 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32. 12266 // Concat upper and lower parts. 12267 // 12268 // v4i32 -> v4i64 12269 // Use vpunpckldq for 4 lower elements v4i32 -> v2i64. 12270 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64. 12271 // Concat upper and lower parts. 12272 // 12273 12274 if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) && 12275 ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) && 12276 ((VT != MVT::v4i64) || (InVT != MVT::v4i32))) 12277 return SDValue(); 12278 12279 if (Subtarget->hasInt256()) 12280 return DAG.getNode(X86ISD::VZEXT, dl, VT, In); 12281 12282 SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl); 12283 SDValue Undef = DAG.getUNDEF(InVT); 12284 bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND; 12285 SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef); 12286 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? 
ZeroVec : Undef); 12287 12288 MVT HVT = MVT::getVectorVT(VT.getVectorElementType(), 12289 VT.getVectorNumElements()/2); 12290 12291 OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo); 12292 OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi); 12293 12294 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); 12295 } 12296 12297 static SDValue LowerZERO_EXTEND_AVX512(SDValue Op, 12298 SelectionDAG &DAG) { 12299 MVT VT = Op->getSimpleValueType(0); 12300 SDValue In = Op->getOperand(0); 12301 MVT InVT = In.getSimpleValueType(); 12302 SDLoc DL(Op); 12303 unsigned int NumElts = VT.getVectorNumElements(); 12304 if (NumElts != 8 && NumElts != 16) 12305 return SDValue(); 12306 12307 if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) 12308 return DAG.getNode(X86ISD::VZEXT, DL, VT, In); 12309 12310 EVT ExtVT = (NumElts == 8)? MVT::v8i64 : MVT::v16i32; 12311 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 12312 // Now we have only mask extension 12313 assert(InVT.getVectorElementType() == MVT::i1); 12314 SDValue Cst = DAG.getTargetConstant(1, ExtVT.getScalarType()); 12315 const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue(); 12316 SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy()); 12317 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); 12318 SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP, 12319 MachinePointerInfo::getConstantPool(), 12320 false, false, false, Alignment); 12321 12322 SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, DL, ExtVT, In, Ld); 12323 if (VT.is512BitVector()) 12324 return Brcst; 12325 return DAG.getNode(X86ISD::VTRUNC, DL, VT, Brcst); 12326 } 12327 12328 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget, 12329 SelectionDAG &DAG) { 12330 if (Subtarget->hasFp256()) { 12331 SDValue Res = LowerAVXExtend(Op, DAG, Subtarget); 12332 if (Res.getNode()) 12333 return Res; 12334 } 12335 12336 return SDValue(); 12337 } 12338 12339 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget, 12340 SelectionDAG &DAG) { 12341 SDLoc DL(Op); 12342 MVT VT = Op.getSimpleValueType(); 12343 SDValue In = Op.getOperand(0); 12344 MVT SVT = In.getSimpleValueType(); 12345 12346 if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1) 12347 return LowerZERO_EXTEND_AVX512(Op, DAG); 12348 12349 if (Subtarget->hasFp256()) { 12350 SDValue Res = LowerAVXExtend(Op, DAG, Subtarget); 12351 if (Res.getNode()) 12352 return Res; 12353 } 12354 12355 assert(!VT.is256BitVector() || !SVT.is128BitVector() || 12356 VT.getVectorNumElements() != SVT.getVectorNumElements()); 12357 return SDValue(); 12358 } 12359 12360 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { 12361 SDLoc DL(Op); 12362 MVT VT = Op.getSimpleValueType(); 12363 SDValue In = Op.getOperand(0); 12364 MVT InVT = In.getSimpleValueType(); 12365 12366 if (VT == MVT::i1) { 12367 assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) && 12368 "Invalid scalar TRUNCATE operation"); 12369 if (InVT.getSizeInBits() >= 32) 12370 return SDValue(); 12371 In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In); 12372 return DAG.getNode(ISD::TRUNCATE, DL, VT, In); 12373 } 12374 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() && 12375 "Invalid TRUNCATE operation"); 12376 12377 if (InVT.is512BitVector() || VT.getVectorElementType() == MVT::i1) { 12378 if (VT.getVectorElementType().getSizeInBits() >=8) 12379 return DAG.getNode(X86ISD::VTRUNC, DL, VT, In); 12380 12381 
assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type"); 12382 unsigned NumElts = InVT.getVectorNumElements(); 12383 assert ((NumElts == 8 || NumElts == 16) && "Unexpected vector type"); 12384 if (InVT.getSizeInBits() < 512) { 12385 MVT ExtVT = (NumElts == 16)? MVT::v16i32 : MVT::v8i64; 12386 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In); 12387 InVT = ExtVT; 12388 } 12389 12390 SDValue Cst = DAG.getTargetConstant(1, InVT.getVectorElementType()); 12391 const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue(); 12392 SDValue CP = DAG.getConstantPool(C, getPointerTy()); 12393 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); 12394 SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP, 12395 MachinePointerInfo::getConstantPool(), 12396 false, false, false, Alignment); 12397 SDValue OneV = DAG.getNode(X86ISD::VBROADCAST, DL, InVT, Ld); 12398 SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In); 12399 return DAG.getNode(X86ISD::TESTM, DL, VT, And, And); 12400 } 12401 12402 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) { 12403 // On AVX2, v4i64 -> v4i32 becomes VPERMD. 12404 if (Subtarget->hasInt256()) { 12405 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1}; 12406 In = DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, In); 12407 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32), 12408 ShufMask); 12409 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In, 12410 DAG.getIntPtrConstant(0)); 12411 } 12412 12413 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, 12414 DAG.getIntPtrConstant(0)); 12415 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, 12416 DAG.getIntPtrConstant(2)); 12417 OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo); 12418 OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi); 12419 static const int ShufMask[] = {0, 2, 4, 6}; 12420 return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask); 12421 } 12422 12423 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) { 12424 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
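// --- Editor's note (not part of the original source) ---
// The mask built below keeps bytes {0,1,4,5,8,9,12,13} of each 128-bit lane
// (the low 16 bits of every i32 element) and zeroes the rest (0x80), so each
// lane ends up with its four truncated i16 values in its low 64 bits; the
// following {0,2} v4i64 shuffle then packs the two lanes together before the
// low 128 bits are extracted as the final v8i16.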
12425 if (Subtarget->hasInt256()) { 12426 In = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, In); 12427 12428 SmallVector<SDValue,32> pshufbMask; 12429 for (unsigned i = 0; i < 2; ++i) { 12430 pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8)); 12431 pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8)); 12432 pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8)); 12433 pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8)); 12434 pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8)); 12435 pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8)); 12436 pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8)); 12437 pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8)); 12438 for (unsigned j = 0; j < 8; ++j) 12439 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 12440 } 12441 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, pshufbMask); 12442 In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV); 12443 In = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, In); 12444 12445 static const int ShufMask[] = {0, 2, -1, -1}; 12446 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, DAG.getUNDEF(MVT::v4i64), 12447 &ShufMask[0]); 12448 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, 12449 DAG.getIntPtrConstant(0)); 12450 return DAG.getNode(ISD::BITCAST, DL, VT, In); 12451 } 12452 12453 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, 12454 DAG.getIntPtrConstant(0)); 12455 12456 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, 12457 DAG.getIntPtrConstant(4)); 12458 12459 OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpLo); 12460 OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpHi); 12461 12462 // The PSHUFB mask: 12463 static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13, 12464 -1, -1, -1, -1, -1, -1, -1, -1}; 12465 12466 SDValue Undef = DAG.getUNDEF(MVT::v16i8); 12467 OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1); 12468 OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1); 12469 12470 OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo); 12471 OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi); 12472 12473 // The MOVLHPS Mask: 12474 static const int ShufMask2[] = {0, 1, 4, 5}; 12475 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2); 12476 return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, res); 12477 } 12478 12479 // Handle truncation of V256 to V128 using shuffles. 12480 if (!VT.is128BitVector() || !InVT.is256BitVector()) 12481 return SDValue(); 12482 12483 assert(Subtarget->hasFp256() && "256-bit vector without AVX!"); 12484 12485 unsigned NumElems = VT.getVectorNumElements(); 12486 MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2); 12487 12488 SmallVector<int, 16> MaskVec(NumElems * 2, -1); 12489 // Prepare truncation shuffle mask 12490 for (unsigned i = 0; i != NumElems; ++i) 12491 MaskVec[i] = i * 2; 12492 SDValue V = DAG.getVectorShuffle(NVT, DL, 12493 DAG.getNode(ISD::BITCAST, DL, NVT, In), 12494 DAG.getUNDEF(NVT), &MaskVec[0]); 12495 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, 12496 DAG.getIntPtrConstant(0)); 12497 } 12498 12499 SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, 12500 SelectionDAG &DAG) const { 12501 assert(!Op.getSimpleValueType().isVector()); 12502 12503 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, 12504 /*IsSigned=*/ true, /*IsReplace=*/ false); 12505 SDValue FIST = Vals.first, StackSlot = Vals.second; 12506 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. 
12507 if (!FIST.getNode()) return Op; 12508 12509 if (StackSlot.getNode()) 12510 // Load the result. 12511 return DAG.getLoad(Op.getValueType(), SDLoc(Op), 12512 FIST, StackSlot, MachinePointerInfo(), 12513 false, false, false, 0); 12514 12515 // The node is the result. 12516 return FIST; 12517 } 12518 12519 SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, 12520 SelectionDAG &DAG) const { 12521 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, 12522 /*IsSigned=*/ false, /*IsReplace=*/ false); 12523 SDValue FIST = Vals.first, StackSlot = Vals.second; 12524 assert(FIST.getNode() && "Unexpected failure"); 12525 12526 if (StackSlot.getNode()) 12527 // Load the result. 12528 return DAG.getLoad(Op.getValueType(), SDLoc(Op), 12529 FIST, StackSlot, MachinePointerInfo(), 12530 false, false, false, 0); 12531 12532 // The node is the result. 12533 return FIST; 12534 } 12535 12536 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) { 12537 SDLoc DL(Op); 12538 MVT VT = Op.getSimpleValueType(); 12539 SDValue In = Op.getOperand(0); 12540 MVT SVT = In.getSimpleValueType(); 12541 12542 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!"); 12543 12544 return DAG.getNode(X86ISD::VFPEXT, DL, VT, 12545 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, 12546 In, DAG.getUNDEF(SVT))); 12547 } 12548 12549 // The only differences between FABS and FNEG are the mask and the logic op. 12550 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { 12551 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) && 12552 "Wrong opcode for lowering FABS or FNEG."); 12553 12554 bool IsFABS = (Op.getOpcode() == ISD::FABS); 12555 SDLoc dl(Op); 12556 MVT VT = Op.getSimpleValueType(); 12557 // Assume scalar op for initialization; update for vector if needed. 12558 // Note that there are no scalar bitwise logical SSE/AVX instructions, so we 12559 // generate a 16-byte vector constant and logic op even for the scalar case. 12560 // Using a 16-byte mask allows folding the load of the mask with 12561 // the logic op, so it can save (~4 bytes) on code size. 12562 MVT EltVT = VT; 12563 unsigned NumElts = VT == MVT::f64 ? 2 : 4; 12564 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to 12565 // decide if we should generate a 16-byte constant mask when we only need 4 or 12566 // 8 bytes for the scalar case. 12567 if (VT.isVector()) { 12568 EltVT = VT.getVectorElementType(); 12569 NumElts = VT.getVectorNumElements(); 12570 } 12571 12572 unsigned EltBits = EltVT.getSizeInBits(); 12573 LLVMContext *Context = DAG.getContext(); 12574 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80... 12575 APInt MaskElt = 12576 IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits); 12577 Constant *C = ConstantInt::get(*Context, MaskElt); 12578 C = ConstantVector::getSplat(NumElts, C); 12579 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 12580 SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy()); 12581 unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment(); 12582 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 12583 MachinePointerInfo::getConstantPool(), 12584 false, false, false, Alignment); 12585 12586 if (VT.isVector()) { 12587 // For a vector, cast operands to a vector type, perform the logic op, 12588 // and cast the result back to the original value type. 
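// --- Editor's note (illustrative, not part of the original source) ---
// E.g. a v4f32 FABS becomes a bitcast to v2i64, an AND with the splatted
// 0x7fffffff constant-pool load above, and a bitcast back; on x86 this
// typically folds into a single 'andps mask(%rip), %xmm0'. FNEG uses the
// sign-bit splat with XOR ('xorps') instead.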
12589 MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64); 12590 SDValue Op0Casted = DAG.getNode(ISD::BITCAST, dl, VecVT, Op.getOperand(0)); 12591 SDValue MaskCasted = DAG.getNode(ISD::BITCAST, dl, VecVT, Mask); 12592 unsigned LogicOp = IsFABS ? ISD::AND : ISD::XOR; 12593 return DAG.getNode(ISD::BITCAST, dl, VT, 12594 DAG.getNode(LogicOp, dl, VecVT, Op0Casted, MaskCasted)); 12595 } 12596 // If not vector, then scalar. 12597 unsigned LogicOp = IsFABS ? X86ISD::FAND : X86ISD::FXOR; 12598 return DAG.getNode(LogicOp, dl, VT, Op.getOperand(0), Mask); 12599 } 12600 12601 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { 12602 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 12603 LLVMContext *Context = DAG.getContext(); 12604 SDValue Op0 = Op.getOperand(0); 12605 SDValue Op1 = Op.getOperand(1); 12606 SDLoc dl(Op); 12607 MVT VT = Op.getSimpleValueType(); 12608 MVT SrcVT = Op1.getSimpleValueType(); 12609 12610 // If second operand is smaller, extend it first. 12611 if (SrcVT.bitsLT(VT)) { 12612 Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1); 12613 SrcVT = VT; 12614 } 12615 // And if it is bigger, shrink it first. 12616 if (SrcVT.bitsGT(VT)) { 12617 Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1)); 12618 SrcVT = VT; 12619 } 12620 12621 // At this point the operands and the result should have the same 12622 // type, and that won't be f80 since that is not custom lowered. 12623 12624 // First get the sign bit of second operand. 12625 SmallVector<Constant*,4> CV; 12626 if (SrcVT == MVT::f64) { 12627 const fltSemantics &Sem = APFloat::IEEEdouble; 12628 CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 1ULL << 63)))); 12629 CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 0)))); 12630 } else { 12631 const fltSemantics &Sem = APFloat::IEEEsingle; 12632 CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 1U << 31)))); 12633 CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); 12634 CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); 12635 CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); 12636 } 12637 Constant *C = ConstantVector::get(CV); 12638 SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16); 12639 SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, 12640 MachinePointerInfo::getConstantPool(), 12641 false, false, false, 16); 12642 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); 12643 12644 // Shift sign bit right or left if the two operands have different types. 12645 if (SrcVT.bitsGT(VT)) { 12646 // Op0 is MVT::f32, Op1 is MVT::f64. 12647 SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit); 12648 SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit, 12649 DAG.getConstant(32, MVT::i32)); 12650 SignBit = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, SignBit); 12651 SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit, 12652 DAG.getIntPtrConstant(0)); 12653 } 12654 12655 // Clear first operand sign bit. 
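// --- Editor's note (not part of the original source) ---
// Taken together, the two masked operations implement
//   copysign(Op0, Op1) = (Op0 & ~SignMask) | (Op1 & SignMask)
// with FAND/FOR on SSE registers: the constant vector built above supplies
// SignMask (applied to Op1), and the one built below supplies ~SignMask
// (applied to Op0).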
12656 CV.clear(); 12657 if (VT == MVT::f64) { 12658 const fltSemantics &Sem = APFloat::IEEEdouble; 12659 CV.push_back(ConstantFP::get(*Context, APFloat(Sem, 12660 APInt(64, ~(1ULL << 63))))); 12661 CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 0)))); 12662 } else { 12663 const fltSemantics &Sem = APFloat::IEEEsingle; 12664 CV.push_back(ConstantFP::get(*Context, APFloat(Sem, 12665 APInt(32, ~(1U << 31))))); 12666 CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); 12667 CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); 12668 CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); 12669 } 12670 C = ConstantVector::get(CV); 12671 CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16); 12672 SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 12673 MachinePointerInfo::getConstantPool(), 12674 false, false, false, 16); 12675 SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); 12676 12677 // Or the value with the sign bit. 12678 return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); 12679 } 12680 12681 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { 12682 SDValue N0 = Op.getOperand(0); 12683 SDLoc dl(Op); 12684 MVT VT = Op.getSimpleValueType(); 12685 12686 // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1). 12687 SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0, 12688 DAG.getConstant(1, VT)); 12689 return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT)); 12690 } 12691 12692 // LowerVectorAllZeroTest - Check whether an OR'd tree is PTEST-able. 12693 // 12694 static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget, 12695 SelectionDAG &DAG) { 12696 assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree."); 12697 12698 if (!Subtarget->hasSSE41()) 12699 return SDValue(); 12700 12701 if (!Op->hasOneUse()) 12702 return SDValue(); 12703 12704 SDNode *N = Op.getNode(); 12705 SDLoc DL(N); 12706 12707 SmallVector<SDValue, 8> Opnds; 12708 DenseMap<SDValue, unsigned> VecInMap; 12709 SmallVector<SDValue, 8> VecIns; 12710 EVT VT = MVT::Other; 12711 12712 // Recognize a special case where a vector is casted into wide integer to 12713 // test all 0s. 12714 Opnds.push_back(N->getOperand(0)); 12715 Opnds.push_back(N->getOperand(1)); 12716 12717 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) { 12718 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot; 12719 // BFS traverse all OR'd operands. 12720 if (I->getOpcode() == ISD::OR) { 12721 Opnds.push_back(I->getOperand(0)); 12722 Opnds.push_back(I->getOperand(1)); 12723 // Re-evaluate the number of nodes to be traversed. 12724 e += 2; // 2 more nodes (LHS and RHS) are pushed. 12725 continue; 12726 } 12727 12728 // Quit if a non-EXTRACT_VECTOR_ELT 12729 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT) 12730 return SDValue(); 12731 12732 // Quit if without a constant index. 12733 SDValue Idx = I->getOperand(1); 12734 if (!isa<ConstantSDNode>(Idx)) 12735 return SDValue(); 12736 12737 SDValue ExtractedFromVec = I->getOperand(0); 12738 DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec); 12739 if (M == VecInMap.end()) { 12740 VT = ExtractedFromVec.getValueType(); 12741 // Quit if not 128/256-bit vector. 12742 if (!VT.is128BitVector() && !VT.is256BitVector()) 12743 return SDValue(); 12744 // Quit if not the same type. 
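// --- Editor's note (not part of the original source) ---
// Conceptually the pattern recognized by this loop is a scalarized
// "are all lanes zero" test such as
//   ((extractelt v,0) | (extractelt v,1) | ... | (extractelt v,N-1)) == 0
// where every lane of each source vector is covered (checked via FullMask
// below); the whole OR tree is then replaced by a single PTEST of the OR'd
// source vectors.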
12745 if (VecInMap.begin() != VecInMap.end() && 12746 VT != VecInMap.begin()->first.getValueType()) 12747 return SDValue(); 12748 M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first; 12749 VecIns.push_back(ExtractedFromVec); 12750 } 12751 M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue(); 12752 } 12753 12754 assert((VT.is128BitVector() || VT.is256BitVector()) && 12755 "Not extracted from 128-/256-bit vector."); 12756 12757 unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U; 12758 12759 for (DenseMap<SDValue, unsigned>::const_iterator 12760 I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) { 12761 // Quit if not all elements are used. 12762 if (I->second != FullMask) 12763 return SDValue(); 12764 } 12765 12766 EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; 12767 12768 // Cast all vectors into TestVT for PTEST. 12769 for (unsigned i = 0, e = VecIns.size(); i < e; ++i) 12770 VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]); 12771 12772 // If more than one full vectors are evaluated, OR them first before PTEST. 12773 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) { 12774 // Each iteration will OR 2 nodes and append the result until there is only 12775 // 1 node left, i.e. the final OR'd value of all vectors. 12776 SDValue LHS = VecIns[Slot]; 12777 SDValue RHS = VecIns[Slot + 1]; 12778 VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS)); 12779 } 12780 12781 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, 12782 VecIns.back(), VecIns.back()); 12783 } 12784 12785 /// \brief return true if \c Op has a use that doesn't just read flags. 12786 static bool hasNonFlagsUse(SDValue Op) { 12787 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE; 12788 ++UI) { 12789 SDNode *User = *UI; 12790 unsigned UOpNo = UI.getOperandNo(); 12791 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) { 12792 // Look pass truncate. 12793 UOpNo = User->use_begin().getOperandNo(); 12794 User = *User->use_begin(); 12795 } 12796 12797 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC && 12798 !(User->getOpcode() == ISD::SELECT && UOpNo == 0)) 12799 return true; 12800 } 12801 return false; 12802 } 12803 12804 /// Emit nodes that will be selected as "test Op0,Op0", or something 12805 /// equivalent. 12806 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl, 12807 SelectionDAG &DAG) const { 12808 if (Op.getValueType() == MVT::i1) 12809 // KORTEST instruction should be selected 12810 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 12811 DAG.getConstant(0, Op.getValueType())); 12812 12813 // CF and OF aren't always set the way we want. Determine which 12814 // of these we need. 12815 bool NeedCF = false; 12816 bool NeedOF = false; 12817 switch (X86CC) { 12818 default: break; 12819 case X86::COND_A: case X86::COND_AE: 12820 case X86::COND_B: case X86::COND_BE: 12821 NeedCF = true; 12822 break; 12823 case X86::COND_G: case X86::COND_GE: 12824 case X86::COND_L: case X86::COND_LE: 12825 case X86::COND_O: case X86::COND_NO: { 12826 // Check if we really need to set the 12827 // Overflow flag. If NoSignedWrap is present 12828 // that is not actually needed. 
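// --- Editor's note (not part of the original source) ---
// E.g. an 'add nsw' feeding a signed compare against zero cannot signed-
// overflow, so OF does not need to be recomputed and the ADD's own EFLAGS
// result can be reused below instead of emitting a separate CMP/TEST.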
12829 switch (Op->getOpcode()) { 12830 case ISD::ADD: 12831 case ISD::SUB: 12832 case ISD::MUL: 12833 case ISD::SHL: { 12834 const BinaryWithFlagsSDNode *BinNode = 12835 cast<BinaryWithFlagsSDNode>(Op.getNode()); 12836 if (BinNode->hasNoSignedWrap()) 12837 break; 12838 } 12839 default: 12840 NeedOF = true; 12841 break; 12842 } 12843 break; 12844 } 12845 } 12846 // See if we can use the EFLAGS value from the operand instead of 12847 // doing a separate TEST. TEST always sets OF and CF to 0, so unless 12848 // we prove that the arithmetic won't overflow, we can't use OF or CF. 12849 if (Op.getResNo() != 0 || NeedOF || NeedCF) { 12850 // Emit a CMP with 0, which is the TEST pattern. 12851 //if (Op.getValueType() == MVT::i1) 12852 // return DAG.getNode(X86ISD::CMP, dl, MVT::i1, Op, 12853 // DAG.getConstant(0, MVT::i1)); 12854 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 12855 DAG.getConstant(0, Op.getValueType())); 12856 } 12857 unsigned Opcode = 0; 12858 unsigned NumOperands = 0; 12859 12860 // Truncate operations may prevent the merge of the SETCC instruction 12861 // and the arithmetic instruction before it. Attempt to truncate the operands 12862 // of the arithmetic instruction and use a reduced bit-width instruction. 12863 bool NeedTruncation = false; 12864 SDValue ArithOp = Op; 12865 if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) { 12866 SDValue Arith = Op->getOperand(0); 12867 // Both the trunc and the arithmetic op need to have one user each. 12868 if (Arith->hasOneUse()) 12869 switch (Arith.getOpcode()) { 12870 default: break; 12871 case ISD::ADD: 12872 case ISD::SUB: 12873 case ISD::AND: 12874 case ISD::OR: 12875 case ISD::XOR: { 12876 NeedTruncation = true; 12877 ArithOp = Arith; 12878 } 12879 } 12880 } 12881 12882 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation 12883 // which may be the result of a CAST. We use the variable 'Op', which is the 12884 // non-casted variable when we check for possible users. 12885 switch (ArithOp.getOpcode()) { 12886 case ISD::ADD: 12887 // Due to an isel shortcoming, be conservative if this add is likely to be 12888 // selected as part of a load-modify-store instruction. When the root node 12889 // in a match is a store, isel doesn't know how to remap non-chain non-flag 12890 // uses of other nodes in the match, such as the ADD in this case. This 12891 // leads to the ADD being left around and reselected, with the result being 12892 // two adds in the output. Alas, even if none our users are stores, that 12893 // doesn't prove we're O.K. Ergo, if we have any parents that aren't 12894 // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require 12895 // climbing the DAG back to the root, and it doesn't seem to be worth the 12896 // effort. 12897 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 12898 UE = Op.getNode()->use_end(); UI != UE; ++UI) 12899 if (UI->getOpcode() != ISD::CopyToReg && 12900 UI->getOpcode() != ISD::SETCC && 12901 UI->getOpcode() != ISD::STORE) 12902 goto default_case; 12903 12904 if (ConstantSDNode *C = 12905 dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) { 12906 // An add of one will be selected as an INC. 12907 if (C->getAPIntValue() == 1 && !Subtarget->slowIncDec()) { 12908 Opcode = X86ISD::INC; 12909 NumOperands = 1; 12910 break; 12911 } 12912 12913 // An add of negative one (subtract of one) will be selected as a DEC. 
12914 if (C->getAPIntValue().isAllOnesValue() && !Subtarget->slowIncDec()) { 12915 Opcode = X86ISD::DEC; 12916 NumOperands = 1; 12917 break; 12918 } 12919 } 12920 12921 // Otherwise use a regular EFLAGS-setting add. 12922 Opcode = X86ISD::ADD; 12923 NumOperands = 2; 12924 break; 12925 case ISD::SHL: 12926 case ISD::SRL: 12927 // If we have a constant logical shift that's only used in a comparison 12928 // against zero turn it into an equivalent AND. This allows turning it into 12929 // a TEST instruction later. 12930 if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() && 12931 isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) { 12932 EVT VT = Op.getValueType(); 12933 unsigned BitWidth = VT.getSizeInBits(); 12934 unsigned ShAmt = Op->getConstantOperandVal(1); 12935 if (ShAmt >= BitWidth) // Avoid undefined shifts. 12936 break; 12937 APInt Mask = ArithOp.getOpcode() == ISD::SRL 12938 ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt) 12939 : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt); 12940 if (!Mask.isSignedIntN(32)) // Avoid large immediates. 12941 break; 12942 SDValue New = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0), 12943 DAG.getConstant(Mask, VT)); 12944 DAG.ReplaceAllUsesWith(Op, New); 12945 Op = New; 12946 } 12947 break; 12948 12949 case ISD::AND: 12950 // If the primary and result isn't used, don't bother using X86ISD::AND, 12951 // because a TEST instruction will be better. 12952 if (!hasNonFlagsUse(Op)) 12953 break; 12954 // FALL THROUGH 12955 case ISD::SUB: 12956 case ISD::OR: 12957 case ISD::XOR: 12958 // Due to the ISEL shortcoming noted above, be conservative if this op is 12959 // likely to be selected as part of a load-modify-store instruction. 12960 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 12961 UE = Op.getNode()->use_end(); UI != UE; ++UI) 12962 if (UI->getOpcode() == ISD::STORE) 12963 goto default_case; 12964 12965 // Otherwise use a regular EFLAGS-setting instruction. 12966 switch (ArithOp.getOpcode()) { 12967 default: llvm_unreachable("unexpected operator!"); 12968 case ISD::SUB: Opcode = X86ISD::SUB; break; 12969 case ISD::XOR: Opcode = X86ISD::XOR; break; 12970 case ISD::AND: Opcode = X86ISD::AND; break; 12971 case ISD::OR: { 12972 if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) { 12973 SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG); 12974 if (EFLAGS.getNode()) 12975 return EFLAGS; 12976 } 12977 Opcode = X86ISD::OR; 12978 break; 12979 } 12980 } 12981 12982 NumOperands = 2; 12983 break; 12984 case X86ISD::ADD: 12985 case X86ISD::SUB: 12986 case X86ISD::INC: 12987 case X86ISD::DEC: 12988 case X86ISD::OR: 12989 case X86ISD::XOR: 12990 case X86ISD::AND: 12991 return SDValue(Op.getNode(), 1); 12992 default: 12993 default_case: 12994 break; 12995 } 12996 12997 // If we found that truncation is beneficial, perform the truncation and 12998 // update 'Op'. 12999 if (NeedTruncation) { 13000 EVT VT = Op.getValueType(); 13001 SDValue WideVal = Op->getOperand(0); 13002 EVT WideVT = WideVal.getValueType(); 13003 unsigned ConvertedOp = 0; 13004 // Use a target machine opcode to prevent further DAGCombine 13005 // optimizations that may separate the arithmetic operations 13006 // from the setcc node. 
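The shift case above (lines 12925-12947) relies on a simple identity: a constant logical shift compared against zero only tests a contiguous group of bits, so it can be rewritten as an AND with a mask and later selected as a single TEST instruction (which is also why the mask must fit a 32-bit immediate). A small scalar check of that equivalence, assuming a 32-bit value and a shift amount below the bit width:

#include <cassert>
#include <cstdint>

static void checkShiftVsMask(uint32_t X, unsigned C) {  // requires C < 32
  // (X >> C) == 0  <=>  (X & high-bits mask) == 0, mask covering bits [C, 31]
  assert(((X >> C) == 0) == ((X & (~0u << C)) == 0));
  // (X << C) == 0  <=>  (X & low-bits mask) == 0, mask covering bits [0, 31-C]
  assert(((X << C) == 0) == ((X & (~0u >> C)) == 0));
}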
13007 switch (WideVal.getOpcode()) { 13008 default: break; 13009 case ISD::ADD: ConvertedOp = X86ISD::ADD; break; 13010 case ISD::SUB: ConvertedOp = X86ISD::SUB; break; 13011 case ISD::AND: ConvertedOp = X86ISD::AND; break; 13012 case ISD::OR: ConvertedOp = X86ISD::OR; break; 13013 case ISD::XOR: ConvertedOp = X86ISD::XOR; break; 13014 } 13015 13016 if (ConvertedOp) { 13017 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 13018 if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) { 13019 SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0)); 13020 SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1)); 13021 Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1); 13022 } 13023 } 13024 } 13025 13026 if (Opcode == 0) 13027 // Emit a CMP with 0, which is the TEST pattern. 13028 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 13029 DAG.getConstant(0, Op.getValueType())); 13030 13031 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 13032 SmallVector<SDValue, 4> Ops; 13033 for (unsigned i = 0; i != NumOperands; ++i) 13034 Ops.push_back(Op.getOperand(i)); 13035 13036 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops); 13037 DAG.ReplaceAllUsesWith(Op, New); 13038 return SDValue(New.getNode(), 1); 13039 } 13040 13041 /// Emit nodes that will be selected as "cmp Op0,Op1", or something 13042 /// equivalent. 13043 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, 13044 SDLoc dl, SelectionDAG &DAG) const { 13045 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) { 13046 if (C->getAPIntValue() == 0) 13047 return EmitTest(Op0, X86CC, dl, DAG); 13048 13049 if (Op0.getValueType() == MVT::i1) 13050 llvm_unreachable("Unexpected comparison operation for MVT::i1 operands"); 13051 } 13052 13053 if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 || 13054 Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) { 13055 // Do the comparison at i32 if it's smaller, besides the Atom case. 13056 // This avoids subregister aliasing issues. Keep the smaller reference 13057 // if we're optimizing for size, however, as that'll allow better folding 13058 // of memory operations. 13059 if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 && 13060 !DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute( 13061 AttributeSet::FunctionIndex, Attribute::MinSize) && 13062 !Subtarget->isAtom()) { 13063 unsigned ExtendOp = 13064 isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; 13065 Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0); 13066 Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1); 13067 } 13068 // Use SUB instead of CMP to enable CSE between SUB and CMP. 13069 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32); 13070 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, 13071 Op0, Op1); 13072 return SDValue(Sub.getNode(), 1); 13073 } 13074 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); 13075 } 13076 13077 /// Convert a comparison if required by the subtarget. 13078 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, 13079 SelectionDAG &DAG) const { 13080 // If the subtarget does not support the FUCOMI instruction, floating-point 13081 // comparisons have to be converted. 
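EmitCmp above widens i8/i16 comparisons to i32 (except when optimizing for size or on Atom) to avoid subregister aliasing, choosing zero-extension for unsigned condition codes and sign-extension for signed ones so the comparison outcome is unchanged. A quick scalar check of that invariant, assuming i8 operands:

#include <cassert>
#include <cstdint>

static void checkWidenedCompare(int8_t A, int8_t B) {
  // Signed compare: sign-extend both operands (ISD::SIGN_EXTEND).
  assert((A < B) == ((int32_t)A < (int32_t)B));
  // Unsigned compare: zero-extend both operands (ISD::ZERO_EXTEND).
  assert(((uint8_t)A < (uint8_t)B) ==
         ((uint32_t)(uint8_t)A < (uint32_t)(uint8_t)B));
}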
13082 if (Subtarget->hasCMov() || 13083 Cmp.getOpcode() != X86ISD::CMP || 13084 !Cmp.getOperand(0).getValueType().isFloatingPoint() || 13085 !Cmp.getOperand(1).getValueType().isFloatingPoint()) 13086 return Cmp; 13087 13088 // The instruction selector will select an FUCOM instruction instead of 13089 // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence 13090 // build an SDNode sequence that transfers the result from FPSW into EFLAGS: 13091 // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8)))) 13092 SDLoc dl(Cmp); 13093 SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp); 13094 SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW); 13095 SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW, 13096 DAG.getConstant(8, MVT::i8)); 13097 SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl); 13098 return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl); 13099 } 13100 13101 static bool isAllOnes(SDValue V) { 13102 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); 13103 return C && C->isAllOnesValue(); 13104 } 13105 13106 /// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node 13107 /// if it's possible. 13108 SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, 13109 SDLoc dl, SelectionDAG &DAG) const { 13110 SDValue Op0 = And.getOperand(0); 13111 SDValue Op1 = And.getOperand(1); 13112 if (Op0.getOpcode() == ISD::TRUNCATE) 13113 Op0 = Op0.getOperand(0); 13114 if (Op1.getOpcode() == ISD::TRUNCATE) 13115 Op1 = Op1.getOperand(0); 13116 13117 SDValue LHS, RHS; 13118 if (Op1.getOpcode() == ISD::SHL) 13119 std::swap(Op0, Op1); 13120 if (Op0.getOpcode() == ISD::SHL) { 13121 if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0))) 13122 if (And00C->getZExtValue() == 1) { 13123 // If we looked past a truncate, check that it's only truncating away 13124 // known zeros. 13125 unsigned BitWidth = Op0.getValueSizeInBits(); 13126 unsigned AndBitWidth = And.getValueSizeInBits(); 13127 if (BitWidth > AndBitWidth) { 13128 APInt Zeros, Ones; 13129 DAG.computeKnownBits(Op0, Zeros, Ones); 13130 if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth) 13131 return SDValue(); 13132 } 13133 LHS = Op1; 13134 RHS = Op0.getOperand(1); 13135 } 13136 } else if (Op1.getOpcode() == ISD::Constant) { 13137 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1); 13138 uint64_t AndRHSVal = AndRHS->getZExtValue(); 13139 SDValue AndLHS = Op0; 13140 13141 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) { 13142 LHS = AndLHS.getOperand(0); 13143 RHS = AndLHS.getOperand(1); 13144 } 13145 13146 // Use BT if the immediate can't be encoded in a TEST instruction. 13147 if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) { 13148 LHS = AndLHS; 13149 RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), LHS.getValueType()); 13150 } 13151 } 13152 13153 if (LHS.getNode()) { 13154 // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT 13155 // instruction. Since the shift amount is in-range-or-undefined, we know 13156 // that doing a bittest on the i32 value is ok. We extend to i32 because 13157 // the encoding for the i16 version is larger than the i32 version. 13158 // Also promote i16 to i32 for performance / code size reason. 13159 if (LHS.getValueType() == MVT::i8 || 13160 LHS.getValueType() == MVT::i16) 13161 LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); 13162 13163 // If the operand types disagree, extend the shift amount to match. 
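ConvertCmpIfNecessary above (lines 13088-13098) rebuilds the EFLAGS that FUCOMI would have produced on targets without CMOV: FNSTSW copies the x87 status word, the shift by 8 moves the condition codes C0/C2/C3 into the low byte, and SAHF loads that byte into EFLAGS so C0, C2 and C3 land in CF, PF and ZF. A small sketch of that bit mapping (the struct and helper names are illustrative only):

#include <cstdint>

struct Flags { bool CF, PF, ZF; };

static Flags fpswToEFLAGS(uint16_t FPSW) {
  uint8_t AH = (uint8_t)(FPSW >> 8);  // srl $8, then truncate to i8
  Flags F;                            // sahf copies AH into the low EFLAGS byte:
  F.CF = (AH >> 0) & 1;               //   bit 0 <- C0
  F.PF = (AH >> 2) & 1;               //   bit 2 <- C2
  F.ZF = (AH >> 6) & 1;               //   bit 6 <- C3
  return F;                           // the same layout FUCOMI writes directly
}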
Since 13164 // BT ignores high bits (like shifts) we can use anyextend. 13165 if (LHS.getValueType() != RHS.getValueType()) 13166 RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS); 13167 13168 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS); 13169 X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; 13170 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 13171 DAG.getConstant(Cond, MVT::i8), BT); 13172 } 13173 13174 return SDValue(); 13175 } 13176 13177 /// \brief - Turns an ISD::CondCode into a value suitable for SSE floating point 13178 /// mask CMPs. 13179 static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, 13180 SDValue &Op1) { 13181 unsigned SSECC; 13182 bool Swap = false; 13183 13184 // SSE Condition code mapping: 13185 // 0 - EQ 13186 // 1 - LT 13187 // 2 - LE 13188 // 3 - UNORD 13189 // 4 - NEQ 13190 // 5 - NLT 13191 // 6 - NLE 13192 // 7 - ORD 13193 switch (SetCCOpcode) { 13194 default: llvm_unreachable("Unexpected SETCC condition"); 13195 case ISD::SETOEQ: 13196 case ISD::SETEQ: SSECC = 0; break; 13197 case ISD::SETOGT: 13198 case ISD::SETGT: Swap = true; // Fallthrough 13199 case ISD::SETLT: 13200 case ISD::SETOLT: SSECC = 1; break; 13201 case ISD::SETOGE: 13202 case ISD::SETGE: Swap = true; // Fallthrough 13203 case ISD::SETLE: 13204 case ISD::SETOLE: SSECC = 2; break; 13205 case ISD::SETUO: SSECC = 3; break; 13206 case ISD::SETUNE: 13207 case ISD::SETNE: SSECC = 4; break; 13208 case ISD::SETULE: Swap = true; // Fallthrough 13209 case ISD::SETUGE: SSECC = 5; break; 13210 case ISD::SETULT: Swap = true; // Fallthrough 13211 case ISD::SETUGT: SSECC = 6; break; 13212 case ISD::SETO: SSECC = 7; break; 13213 case ISD::SETUEQ: 13214 case ISD::SETONE: SSECC = 8; break; 13215 } 13216 if (Swap) 13217 std::swap(Op0, Op1); 13218 13219 return SSECC; 13220 } 13221 13222 // Lower256IntVSETCC - Break a VSETCC 256-bit integer VSETCC into two new 128 13223 // ones, and then concatenate the result back. 
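translateX86FSETCC above maps a DAG condition code onto the eight CMPPS/CMPPD predicates (EQ, LT, LE, UNORD, NEQ, NLT, NLE, ORD); the Swap flag handles conditions that have no direct encoding, such as greater-than, by commuting the operands. The same trick written with SSE intrinsics, as a sketch:

#include <xmmintrin.h>

// "A > B" has no CMPPS predicate of its own (pre-AVX), so swap the operands
// and use predicate 1 (LT) -- exactly what Swap = true arranges above.
static __m128 orderedGreaterThan(__m128 A, __m128 B) {
  return _mm_cmplt_ps(B, A);   // ordered compare: any NaN lane yields false
}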
13224 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) { 13225 MVT VT = Op.getSimpleValueType(); 13226 13227 assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC && 13228 "Unsupported value type for operation"); 13229 13230 unsigned NumElems = VT.getVectorNumElements(); 13231 SDLoc dl(Op); 13232 SDValue CC = Op.getOperand(2); 13233 13234 // Extract the LHS vectors 13235 SDValue LHS = Op.getOperand(0); 13236 SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl); 13237 SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl); 13238 13239 // Extract the RHS vectors 13240 SDValue RHS = Op.getOperand(1); 13241 SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl); 13242 SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl); 13243 13244 // Issue the operation on the smaller types and concatenate the result back 13245 MVT EltVT = VT.getVectorElementType(); 13246 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); 13247 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, 13248 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC), 13249 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC)); 13250 } 13251 13252 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG, 13253 const X86Subtarget *Subtarget) { 13254 SDValue Op0 = Op.getOperand(0); 13255 SDValue Op1 = Op.getOperand(1); 13256 SDValue CC = Op.getOperand(2); 13257 MVT VT = Op.getSimpleValueType(); 13258 SDLoc dl(Op); 13259 13260 assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 8 && 13261 Op.getValueType().getScalarType() == MVT::i1 && 13262 "Cannot set masked compare for this operation"); 13263 13264 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 13265 unsigned Opc = 0; 13266 bool Unsigned = false; 13267 bool Swap = false; 13268 unsigned SSECC; 13269 switch (SetCCOpcode) { 13270 default: llvm_unreachable("Unexpected SETCC condition"); 13271 case ISD::SETNE: SSECC = 4; break; 13272 case ISD::SETEQ: Opc = X86ISD::PCMPEQM; break; 13273 case ISD::SETUGT: SSECC = 6; Unsigned = true; break; 13274 case ISD::SETLT: Swap = true; //fall-through 13275 case ISD::SETGT: Opc = X86ISD::PCMPGTM; break; 13276 case ISD::SETULT: SSECC = 1; Unsigned = true; break; 13277 case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT 13278 case ISD::SETGE: Swap = true; SSECC = 2; break; // LE + swap 13279 case ISD::SETULE: Unsigned = true; //fall-through 13280 case ISD::SETLE: SSECC = 2; break; 13281 } 13282 13283 if (Swap) 13284 std::swap(Op0, Op1); 13285 if (Opc) 13286 return DAG.getNode(Opc, dl, VT, Op0, Op1); 13287 Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM; 13288 return DAG.getNode(Opc, dl, VT, Op0, Op1, 13289 DAG.getConstant(SSECC, MVT::i8)); 13290 } 13291 13292 /// \brief Try to turn a VSETULT into a VSETULE by modifying its second 13293 /// operand \p Op1. If non-trivial (for example because it's not constant) 13294 /// return an empty value. 13295 static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG) 13296 { 13297 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode()); 13298 if (!BV) 13299 return SDValue(); 13300 13301 MVT VT = Op1.getSimpleValueType(); 13302 MVT EVT = VT.getVectorElementType(); 13303 unsigned n = VT.getVectorNumElements(); 13304 SmallVector<SDValue, 8> ULTOp1; 13305 13306 for (unsigned i = 0; i < n; ++i) { 13307 ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i)); 13308 if (!Elt || Elt->isOpaque() || Elt->getValueType(0) != EVT) 13309 return SDValue(); 13310 13311 // Avoid underflow. 
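Lower256IntVSETCC above exists because AVX1 has no 256-bit integer compares: the operands are split into 128-bit halves, compared separately, and the results concatenated. A sketch of the equivalent operation with AVX intrinsics (the helper name is illustrative):

#include <immintrin.h>

static __m256i cmpgt_epi32_avx1(__m256i A, __m256i B) {
  __m128i Lo = _mm_cmpgt_epi32(_mm256_castsi256_si128(A),
                               _mm256_castsi256_si128(B));
  __m128i Hi = _mm_cmpgt_epi32(_mm256_extractf128_si256(A, 1),
                               _mm256_extractf128_si256(B, 1));
  // Concatenate the two 128-bit results back into one 256-bit value.
  return _mm256_insertf128_si256(_mm256_castsi128_si256(Lo), Hi, 1);
}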
13312 APInt Val = Elt->getAPIntValue(); 13313 if (Val == 0) 13314 return SDValue(); 13315 13316 ULTOp1.push_back(DAG.getConstant(Val - 1, EVT)); 13317 } 13318 13319 return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, ULTOp1); 13320 } 13321 13322 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, 13323 SelectionDAG &DAG) { 13324 SDValue Op0 = Op.getOperand(0); 13325 SDValue Op1 = Op.getOperand(1); 13326 SDValue CC = Op.getOperand(2); 13327 MVT VT = Op.getSimpleValueType(); 13328 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 13329 bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint(); 13330 SDLoc dl(Op); 13331 13332 if (isFP) { 13333 #ifndef NDEBUG 13334 MVT EltVT = Op0.getSimpleValueType().getVectorElementType(); 13335 assert(EltVT == MVT::f32 || EltVT == MVT::f64); 13336 #endif 13337 13338 unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1); 13339 unsigned Opc = X86ISD::CMPP; 13340 if (Subtarget->hasAVX512() && VT.getVectorElementType() == MVT::i1) { 13341 assert(VT.getVectorNumElements() <= 16); 13342 Opc = X86ISD::CMPM; 13343 } 13344 // In the two special cases we can't handle, emit two comparisons. 13345 if (SSECC == 8) { 13346 unsigned CC0, CC1; 13347 unsigned CombineOpc; 13348 if (SetCCOpcode == ISD::SETUEQ) { 13349 CC0 = 3; CC1 = 0; CombineOpc = ISD::OR; 13350 } else { 13351 assert(SetCCOpcode == ISD::SETONE); 13352 CC0 = 7; CC1 = 4; CombineOpc = ISD::AND; 13353 } 13354 13355 SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1, 13356 DAG.getConstant(CC0, MVT::i8)); 13357 SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1, 13358 DAG.getConstant(CC1, MVT::i8)); 13359 return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1); 13360 } 13361 // Handle all other FP comparisons here. 13362 return DAG.getNode(Opc, dl, VT, Op0, Op1, 13363 DAG.getConstant(SSECC, MVT::i8)); 13364 } 13365 13366 // Break 256-bit integer vector compare into smaller ones. 13367 if (VT.is256BitVector() && !Subtarget->hasInt256()) 13368 return Lower256IntVSETCC(Op, DAG); 13369 13370 bool MaskResult = (VT.getVectorElementType() == MVT::i1); 13371 EVT OpVT = Op1.getValueType(); 13372 if (Subtarget->hasAVX512()) { 13373 if (Op1.getValueType().is512BitVector() || 13374 (Subtarget->hasBWI() && Subtarget->hasVLX()) || 13375 (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32)) 13376 return LowerIntVSETCC_AVX512(Op, DAG, Subtarget); 13377 13378 // In AVX-512 architecture setcc returns mask with i1 elements, 13379 // But there is no compare instruction for i8 and i16 elements in KNL. 13380 // We are not talking about 512-bit operands in this case, these 13381 // types are illegal. 13382 if (MaskResult && 13383 (OpVT.getVectorElementType().getSizeInBits() < 32 && 13384 OpVT.getVectorElementType().getSizeInBits() >= 8)) 13385 return DAG.getNode(ISD::TRUNCATE, dl, VT, 13386 DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC)); 13387 } 13388 13389 // We are handling one of the integer comparisons here. Since SSE only has 13390 // GT and EQ comparisons for integer, swapping operands and multiple 13391 // operations may be required for some comparisons. 
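ChangeVSETULTtoVSETULE above rewrites an unsigned less-than against a constant vector into a less-or-equal against the constant minus one, bailing out whenever an element is zero since the subtraction would wrap. The underlying scalar identity:

#include <cassert>
#include <cstdint>

static void checkULTtoULE(uint32_t X, uint32_t C) {
  if (C == 0) return;             // rejected above: C - 1 would underflow
  assert((X < C) == (X <= C - 1));
}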
13392 unsigned Opc; 13393 bool Swap = false, Invert = false, FlipSigns = false, MinMax = false; 13394 bool Subus = false; 13395 13396 switch (SetCCOpcode) { 13397 default: llvm_unreachable("Unexpected SETCC condition"); 13398 case ISD::SETNE: Invert = true; 13399 case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break; 13400 case ISD::SETLT: Swap = true; 13401 case ISD::SETGT: Opc = X86ISD::PCMPGT; break; 13402 case ISD::SETGE: Swap = true; 13403 case ISD::SETLE: Opc = X86ISD::PCMPGT; 13404 Invert = true; break; 13405 case ISD::SETULT: Swap = true; 13406 case ISD::SETUGT: Opc = X86ISD::PCMPGT; 13407 FlipSigns = true; break; 13408 case ISD::SETUGE: Swap = true; 13409 case ISD::SETULE: Opc = X86ISD::PCMPGT; 13410 FlipSigns = true; Invert = true; break; 13411 } 13412 13413 // Special case: Use min/max operations for SETULE/SETUGE 13414 MVT VET = VT.getVectorElementType(); 13415 bool hasMinMax = 13416 (Subtarget->hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32)) 13417 || (Subtarget->hasSSE2() && (VET == MVT::i8)); 13418 13419 if (hasMinMax) { 13420 switch (SetCCOpcode) { 13421 default: break; 13422 case ISD::SETULE: Opc = X86ISD::UMIN; MinMax = true; break; 13423 case ISD::SETUGE: Opc = X86ISD::UMAX; MinMax = true; break; 13424 } 13425 13426 if (MinMax) { Swap = false; Invert = false; FlipSigns = false; } 13427 } 13428 13429 bool hasSubus = Subtarget->hasSSE2() && (VET == MVT::i8 || VET == MVT::i16); 13430 if (!MinMax && hasSubus) { 13431 // As another special case, use PSUBUS[BW] when it's profitable. E.g. for 13432 // Op0 u<= Op1: 13433 // t = psubus Op0, Op1 13434 // pcmpeq t, <0..0> 13435 switch (SetCCOpcode) { 13436 default: break; 13437 case ISD::SETULT: { 13438 // If the comparison is against a constant we can turn this into a 13439 // setule. With psubus, setule does not require a swap. This is 13440 // beneficial because the constant in the register is no longer 13441 // destructed as the destination so it can be hoisted out of a loop. 13442 // Only do this pre-AVX since vpcmp* is no longer destructive. 13443 if (Subtarget->hasAVX()) 13444 break; 13445 SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG); 13446 if (ULEOp1.getNode()) { 13447 Op1 = ULEOp1; 13448 Subus = true; Invert = false; Swap = false; 13449 } 13450 break; 13451 } 13452 // Psubus is better than flip-sign because it requires no inversion. 13453 case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break; 13454 case ISD::SETULE: Subus = true; Invert = false; Swap = false; break; 13455 } 13456 13457 if (Subus) { 13458 Opc = X86ISD::SUBUS; 13459 FlipSigns = false; 13460 } 13461 } 13462 13463 if (Swap) 13464 std::swap(Op0, Op1); 13465 13466 // Check that the operation in question is available (most are plain SSE2, 13467 // but PCMPGTQ and PCMPEQQ have different requirements). 13468 if (VT == MVT::v2i64) { 13469 if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) { 13470 assert(Subtarget->hasSSE2() && "Don't know how to lower!"); 13471 13472 // First cast everything to the right type. 13473 Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0); 13474 Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1); 13475 13476 // Since SSE has no unsigned integer comparisons, we need to flip the sign 13477 // bits of the inputs before performing those operations. The lower 13478 // compare is always unsigned. 
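Both special cases prepared above rest on simple unsigned identities: with UMIN, x <=u y holds exactly when umin(x, y) == x (hence the later PCMPEQ against the original operand), and with saturating subtraction, x <=u y holds exactly when psubus(x, y) is zero. A scalar check of both, assuming i8 elements:

#include <algorithm>
#include <cassert>
#include <cstdint>

static void checkUnsignedCompareTricks(uint8_t X, uint8_t Y) {
  uint8_t Subus = (uint8_t)(X > Y ? X - Y : 0);   // per-element PSUBUSB
  assert((X <= Y) == (std::min(X, Y) == X));      // UMIN + PCMPEQ path
  assert((X <= Y) == (Subus == 0));               // PSUBUS + compare-with-zero path
}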
13479 SDValue SB; 13480 if (FlipSigns) { 13481 SB = DAG.getConstant(0x80000000U, MVT::v4i32); 13482 } else { 13483 SDValue Sign = DAG.getConstant(0x80000000U, MVT::i32); 13484 SDValue Zero = DAG.getConstant(0x00000000U, MVT::i32); 13485 SB = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, 13486 Sign, Zero, Sign, Zero); 13487 } 13488 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB); 13489 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB); 13490 13491 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2)) 13492 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1); 13493 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1); 13494 13495 // Create masks for only the low parts/high parts of the 64 bit integers. 13496 static const int MaskHi[] = { 1, 1, 3, 3 }; 13497 static const int MaskLo[] = { 0, 0, 2, 2 }; 13498 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi); 13499 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo); 13500 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi); 13501 13502 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo); 13503 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi); 13504 13505 if (Invert) 13506 Result = DAG.getNOT(dl, Result, MVT::v4i32); 13507 13508 return DAG.getNode(ISD::BITCAST, dl, VT, Result); 13509 } 13510 13511 if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) { 13512 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with 13513 // pcmpeqd + pshufd + pand. 13514 assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!"); 13515 13516 // First cast everything to the right type. 13517 Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0); 13518 Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1); 13519 13520 // Do the compare. 13521 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1); 13522 13523 // Make sure the lower and upper halves are both all-ones. 13524 static const int Mask[] = { 1, 0, 3, 2 }; 13525 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask); 13526 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf); 13527 13528 if (Invert) 13529 Result = DAG.getNOT(dl, Result, MVT::v4i32); 13530 13531 return DAG.getNode(ISD::BITCAST, dl, VT, Result); 13532 } 13533 } 13534 13535 // Since SSE has no unsigned integer comparisons, we need to flip the sign 13536 // bits of the inputs before performing those operations. 13537 if (FlipSigns) { 13538 EVT EltVT = VT.getVectorElementType(); 13539 SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), VT); 13540 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB); 13541 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB); 13542 } 13543 13544 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); 13545 13546 // If the logical-not of the result is required, perform that now. 
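The PCMPGTQ emulation above decides a signed 64-bit greater-than from 32-bit pieces: the high halves are compared as signed integers, and on a tie the low halves are compared as unsigned, which PCMPGTD can only do after the sign-flip XOR with SB. A scalar check of the identity (the helper is illustrative):

#include <cassert>
#include <cstdint>

static void checkPcmpgtqEmulation(int64_t A, int64_t B) {
  int32_t HiA = (int32_t)((uint64_t)A >> 32), HiB = (int32_t)((uint64_t)B >> 32);
  uint32_t LoA = (uint32_t)A, LoB = (uint32_t)B;

  // The low halves compare as unsigned; PCMPGTD is signed, so the sign bits
  // are flipped first (the XOR with SB above) to make a signed compare act
  // like an unsigned one.
  bool LoGT = (int32_t)(LoA ^ 0x80000000u) > (int32_t)(LoB ^ 0x80000000u);
  bool Emulated = (HiA > HiB) || ((HiA == HiB) && LoGT);

  assert(Emulated == (A > B));
}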
13547 if (Invert) 13548 Result = DAG.getNOT(dl, Result, VT); 13549 13550 if (MinMax) 13551 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result); 13552 13553 if (Subus) 13554 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result, 13555 getZeroVector(VT, Subtarget, DAG, dl)); 13556 13557 return Result; 13558 } 13559 13560 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { 13561 13562 MVT VT = Op.getSimpleValueType(); 13563 13564 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG); 13565 13566 assert(((!Subtarget->hasAVX512() && VT == MVT::i8) || (VT == MVT::i1)) 13567 && "SetCC type must be 8-bit or 1-bit integer"); 13568 SDValue Op0 = Op.getOperand(0); 13569 SDValue Op1 = Op.getOperand(1); 13570 SDLoc dl(Op); 13571 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 13572 13573 // Optimize to BT if possible. 13574 // Lower (X & (1 << N)) == 0 to BT(X, N). 13575 // Lower ((X >>u N) & 1) != 0 to BT(X, N). 13576 // Lower ((X >>s N) & 1) != 0 to BT(X, N). 13577 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && 13578 Op1.getOpcode() == ISD::Constant && 13579 cast<ConstantSDNode>(Op1)->isNullValue() && 13580 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 13581 SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG); 13582 if (NewSetCC.getNode()) 13583 return NewSetCC; 13584 } 13585 13586 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of 13587 // these. 13588 if (Op1.getOpcode() == ISD::Constant && 13589 (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 || 13590 cast<ConstantSDNode>(Op1)->isNullValue()) && 13591 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 13592 13593 // If the input is a setcc, then reuse the input setcc or use a new one with 13594 // the inverted condition. 13595 if (Op0.getOpcode() == X86ISD::SETCC) { 13596 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); 13597 bool Invert = (CC == ISD::SETNE) ^ 13598 cast<ConstantSDNode>(Op1)->isNullValue(); 13599 if (!Invert) 13600 return Op0; 13601 13602 CCode = X86::GetOppositeBranchCondition(CCode); 13603 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 13604 DAG.getConstant(CCode, MVT::i8), 13605 Op0.getOperand(1)); 13606 if (VT == MVT::i1) 13607 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC); 13608 return SetCC; 13609 } 13610 } 13611 if ((Op0.getValueType() == MVT::i1) && (Op1.getOpcode() == ISD::Constant) && 13612 (cast<ConstantSDNode>(Op1)->getZExtValue() == 1) && 13613 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 13614 13615 ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true); 13616 return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, MVT::i1), NewCC); 13617 } 13618 13619 bool isFP = Op1.getSimpleValueType().isFloatingPoint(); 13620 unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); 13621 if (X86CC == X86::COND_INVALID) 13622 return SDValue(); 13623 13624 SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG); 13625 EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG); 13626 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 13627 DAG.getConstant(X86CC, MVT::i8), EFLAGS); 13628 if (VT == MVT::i1) 13629 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC); 13630 return SetCC; 13631 } 13632 13633 // isX86LogicalCmp - Return true if opcode is a X86 logical comparison. 
13634 static bool isX86LogicalCmp(SDValue Op) { 13635 unsigned Opc = Op.getNode()->getOpcode(); 13636 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI || 13637 Opc == X86ISD::SAHF) 13638 return true; 13639 if (Op.getResNo() == 1 && 13640 (Opc == X86ISD::ADD || 13641 Opc == X86ISD::SUB || 13642 Opc == X86ISD::ADC || 13643 Opc == X86ISD::SBB || 13644 Opc == X86ISD::SMUL || 13645 Opc == X86ISD::UMUL || 13646 Opc == X86ISD::INC || 13647 Opc == X86ISD::DEC || 13648 Opc == X86ISD::OR || 13649 Opc == X86ISD::XOR || 13650 Opc == X86ISD::AND)) 13651 return true; 13652 13653 if (Op.getResNo() == 2 && Opc == X86ISD::UMUL) 13654 return true; 13655 13656 return false; 13657 } 13658 13659 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) { 13660 if (V.getOpcode() != ISD::TRUNCATE) 13661 return false; 13662 13663 SDValue VOp0 = V.getOperand(0); 13664 unsigned InBits = VOp0.getValueSizeInBits(); 13665 unsigned Bits = V.getValueSizeInBits(); 13666 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits)); 13667 } 13668 13669 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 13670 bool addTest = true; 13671 SDValue Cond = Op.getOperand(0); 13672 SDValue Op1 = Op.getOperand(1); 13673 SDValue Op2 = Op.getOperand(2); 13674 SDLoc DL(Op); 13675 EVT VT = Op1.getValueType(); 13676 SDValue CC; 13677 13678 // Lower fp selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops 13679 // are available. Otherwise fp cmovs get lowered into a less efficient branch 13680 // sequence later on. 13681 if (Cond.getOpcode() == ISD::SETCC && 13682 ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) || 13683 (Subtarget->hasSSE1() && VT == MVT::f32)) && 13684 VT == Cond.getOperand(0).getValueType() && Cond->hasOneUse()) { 13685 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1); 13686 int SSECC = translateX86FSETCC( 13687 cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1); 13688 13689 if (SSECC != 8) { 13690 if (Subtarget->hasAVX512()) { 13691 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CondOp0, CondOp1, 13692 DAG.getConstant(SSECC, MVT::i8)); 13693 return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2); 13694 } 13695 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1, 13696 DAG.getConstant(SSECC, MVT::i8)); 13697 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2); 13698 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1); 13699 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And); 13700 } 13701 } 13702 13703 if (Cond.getOpcode() == ISD::SETCC) { 13704 SDValue NewCond = LowerSETCC(Cond, DAG); 13705 if (NewCond.getNode()) 13706 Cond = NewCond; 13707 } 13708 13709 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y 13710 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y 13711 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y 13712 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y 13713 if (Cond.getOpcode() == X86ISD::SETCC && 13714 Cond.getOperand(1).getOpcode() == X86ISD::CMP && 13715 isZero(Cond.getOperand(1).getOperand(1))) { 13716 SDValue Cmp = Cond.getOperand(1); 13717 13718 unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue(); 13719 13720 if ((isAllOnes(Op1) || isAllOnes(Op2)) && 13721 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { 13722 SDValue Y = isAllOnes(Op2) ? 
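The CMP/AND/ANDN/OR sequence at the top of LowerSELECT above turns a scalar floating-point select into a branchless blend: the compare yields an all-ones or all-zeros mask, AND keeps the true value where the mask is set, ANDN keeps the false value elsewhere, and OR merges them. The same shape with SSE intrinsics, as a sketch:

#include <xmmintrin.h>

// Branchless "(A < B) ? X : Y" in the style of FSETCC/FAND/FANDN/FOR above.
static __m128 selectIfLess(__m128 A, __m128 B, __m128 X, __m128 Y) {
  __m128 Mask = _mm_cmplt_ps(A, B);             // all-ones where A < B, else zero
  return _mm_or_ps(_mm_and_ps(Mask, X),         // lanes where the condition holds
                   _mm_andnot_ps(Mask, Y));     // lanes where it does not
}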
Op1 : Op2; 13723 13724 SDValue CmpOp0 = Cmp.getOperand(0); 13725 // Apply further optimizations for special cases 13726 // (select (x != 0), -1, 0) -> neg & sbb 13727 // (select (x == 0), 0, -1) -> neg & sbb 13728 if (ConstantSDNode *YC = dyn_cast<ConstantSDNode>(Y)) 13729 if (YC->isNullValue() && 13730 (isAllOnes(Op1) == (CondCode == X86::COND_NE))) { 13731 SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32); 13732 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, 13733 DAG.getConstant(0, CmpOp0.getValueType()), 13734 CmpOp0); 13735 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), 13736 DAG.getConstant(X86::COND_B, MVT::i8), 13737 SDValue(Neg.getNode(), 1)); 13738 return Res; 13739 } 13740 13741 Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, 13742 CmpOp0, DAG.getConstant(1, CmpOp0.getValueType())); 13743 Cmp = ConvertCmpIfNecessary(Cmp, DAG); 13744 13745 SDValue Res = // Res = 0 or -1. 13746 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), 13747 DAG.getConstant(X86::COND_B, MVT::i8), Cmp); 13748 13749 if (isAllOnes(Op1) != (CondCode == X86::COND_E)) 13750 Res = DAG.getNOT(DL, Res, Res.getValueType()); 13751 13752 ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2); 13753 if (!N2C || !N2C->isNullValue()) 13754 Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y); 13755 return Res; 13756 } 13757 } 13758 13759 // Look past (and (setcc_carry (cmp ...)), 1). 13760 if (Cond.getOpcode() == ISD::AND && 13761 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 13762 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 13763 if (C && C->getAPIntValue() == 1) 13764 Cond = Cond.getOperand(0); 13765 } 13766 13767 // If condition flag is set by a X86ISD::CMP, then use it as the condition 13768 // setting operand in place of the X86ISD::SETCC. 13769 unsigned CondOpcode = Cond.getOpcode(); 13770 if (CondOpcode == X86ISD::SETCC || 13771 CondOpcode == X86ISD::SETCC_CARRY) { 13772 CC = Cond.getOperand(0); 13773 13774 SDValue Cmp = Cond.getOperand(1); 13775 unsigned Opc = Cmp.getOpcode(); 13776 MVT VT = Op.getSimpleValueType(); 13777 13778 bool IllegalFPCMov = false; 13779 if (VT.isFloatingPoint() && !VT.isVector() && 13780 !isScalarFPTypeInSSEReg(VT)) // FPStack? 
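The neg-and-sbb idiom applied above for "(x != 0) ? -1 : 0" works because subtracting x from zero borrows exactly when x is nonzero, and SETCC_CARRY (an sbb of a register with itself) materializes 0 - CF, i.e. all-ones when the carry is set. A scalar check of that identity:

#include <cassert>
#include <cstdint>

static void checkNegSbb(uint32_t X) {
  bool CF = (0u < X);                          // borrow out of the subtraction 0 - X
  uint32_t Res = CF ? 0xFFFFFFFFu : 0u;        // what "sbb %reg, %reg" leaves behind
  assert(Res == (X != 0 ? 0xFFFFFFFFu : 0u));  // the requested -1 / 0 select
}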
13781 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue()); 13782 13783 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || 13784 Opc == X86ISD::BT) { // FIXME 13785 Cond = Cmp; 13786 addTest = false; 13787 } 13788 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || 13789 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || 13790 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) && 13791 Cond.getOperand(0).getValueType() != MVT::i8)) { 13792 SDValue LHS = Cond.getOperand(0); 13793 SDValue RHS = Cond.getOperand(1); 13794 unsigned X86Opcode; 13795 unsigned X86Cond; 13796 SDVTList VTs; 13797 switch (CondOpcode) { 13798 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break; 13799 case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; 13800 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break; 13801 case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break; 13802 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break; 13803 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break; 13804 default: llvm_unreachable("unexpected overflowing operator"); 13805 } 13806 if (CondOpcode == ISD::UMULO) 13807 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(), 13808 MVT::i32); 13809 else 13810 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); 13811 13812 SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS); 13813 13814 if (CondOpcode == ISD::UMULO) 13815 Cond = X86Op.getValue(2); 13816 else 13817 Cond = X86Op.getValue(1); 13818 13819 CC = DAG.getConstant(X86Cond, MVT::i8); 13820 addTest = false; 13821 } 13822 13823 if (addTest) { 13824 // Look pass the truncate if the high bits are known zero. 13825 if (isTruncWithZeroHighBitsInput(Cond, DAG)) 13826 Cond = Cond.getOperand(0); 13827 13828 // We know the result of AND is compared against zero. Try to match 13829 // it to BT. 13830 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 13831 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG); 13832 if (NewSetCC.getNode()) { 13833 CC = NewSetCC.getOperand(0); 13834 Cond = NewSetCC.getOperand(1); 13835 addTest = false; 13836 } 13837 } 13838 } 13839 13840 if (addTest) { 13841 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 13842 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG); 13843 } 13844 13845 // a < b ? -1 : 0 -> RES = ~setcc_carry 13846 // a < b ? 0 : -1 -> RES = setcc_carry 13847 // a >= b ? -1 : 0 -> RES = setcc_carry 13848 // a >= b ? 0 : -1 -> RES = ~setcc_carry 13849 if (Cond.getOpcode() == X86ISD::SUB) { 13850 Cond = ConvertCmpIfNecessary(Cond, DAG); 13851 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue(); 13852 13853 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) && 13854 (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) { 13855 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), 13856 DAG.getConstant(X86::COND_B, MVT::i8), Cond); 13857 if (isAllOnes(Op1) != (CondCode == X86::COND_B)) 13858 return DAG.getNOT(DL, Res, Res.getValueType()); 13859 return Res; 13860 } 13861 } 13862 13863 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate 13864 // widen the cmov and push the truncate through. This avoids introducing a new 13865 // branch during isel and doesn't add any extensions. 
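The mapping above sends unsigned overflow checks to COND_B and signed ones to COND_O because, after an x86 ADD, unsigned wrap-around is reported by the carry flag while signed overflow is reported by the overflow flag. A scalar check of both facts (using the GCC/Clang __builtin_add_overflow builtin as a reference):

#include <cassert>
#include <cstdint>

static void checkAddOverflowFlags(uint32_t A, uint32_t B) {
  uint32_t R = A + B;
  bool CF = R < A;                                    // carry flag after a 32-bit ADD
  bool OF = (((~(A ^ B)) & (A ^ R)) >> 31) != 0;      // overflow flag after the same ADD

  uint32_t UR; int32_t SR;
  assert(__builtin_add_overflow(A, B, &UR) == CF);                   // ISD::UADDO -> X86::COND_B
  assert(__builtin_add_overflow((int32_t)A, (int32_t)B, &SR) == OF); // ISD::SADDO -> X86::COND_O
}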
13866 if (Op.getValueType() == MVT::i8 && 13867 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) { 13868 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0); 13869 if (T1.getValueType() == T2.getValueType() && 13870 // Blacklist CopyFromReg to avoid partial register stalls. 13871 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){ 13872 SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue); 13873 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond); 13874 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov); 13875 } 13876 } 13877 13878 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if 13879 // condition is true. 13880 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue); 13881 SDValue Ops[] = { Op2, Op1, CC, Cond }; 13882 return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops); 13883 } 13884 13885 static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG) { 13886 MVT VT = Op->getSimpleValueType(0); 13887 SDValue In = Op->getOperand(0); 13888 MVT InVT = In.getSimpleValueType(); 13889 SDLoc dl(Op); 13890 13891 unsigned int NumElts = VT.getVectorNumElements(); 13892 if (NumElts != 8 && NumElts != 16) 13893 return SDValue(); 13894 13895 if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) 13896 return DAG.getNode(X86ISD::VSEXT, dl, VT, In); 13897 13898 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 13899 assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type"); 13900 13901 MVT ExtVT = (NumElts == 8) ? MVT::v8i64 : MVT::v16i32; 13902 Constant *C = ConstantInt::get(*DAG.getContext(), 13903 APInt::getAllOnesValue(ExtVT.getScalarType().getSizeInBits())); 13904 13905 SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy()); 13906 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); 13907 SDValue Ld = DAG.getLoad(ExtVT.getScalarType(), dl, DAG.getEntryNode(), CP, 13908 MachinePointerInfo::getConstantPool(), 13909 false, false, false, Alignment); 13910 SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, dl, ExtVT, In, Ld); 13911 if (VT.is512BitVector()) 13912 return Brcst; 13913 return DAG.getNode(X86ISD::VTRUNC, dl, VT, Brcst); 13914 } 13915 13916 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget, 13917 SelectionDAG &DAG) { 13918 MVT VT = Op->getSimpleValueType(0); 13919 SDValue In = Op->getOperand(0); 13920 MVT InVT = In.getSimpleValueType(); 13921 SDLoc dl(Op); 13922 13923 if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1) 13924 return LowerSIGN_EXTEND_AVX512(Op, DAG); 13925 13926 if ((VT != MVT::v4i64 || InVT != MVT::v4i32) && 13927 (VT != MVT::v8i32 || InVT != MVT::v8i16) && 13928 (VT != MVT::v16i16 || InVT != MVT::v16i8)) 13929 return SDValue(); 13930 13931 if (Subtarget->hasInt256()) 13932 return DAG.getNode(X86ISD::VSEXT, dl, VT, In); 13933 13934 // Optimize vectors in AVX mode 13935 // Sign extend v8i16 to v8i32 and 13936 // v4i32 to v4i64 13937 // 13938 // Divide input vector into two parts 13939 // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1} 13940 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32 13941 // concat the vectors to original VT 13942 13943 unsigned NumElems = InVT.getVectorNumElements(); 13944 SDValue Undef = DAG.getUNDEF(InVT); 13945 13946 SmallVector<int,8> ShufMask1(NumElems, -1); 13947 for (unsigned i = 0; i != NumElems/2; ++i) 13948 ShufMask1[i] = i; 13949 13950 SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask1[0]); 13951 
13952 SmallVector<int,8> ShufMask2(NumElems, -1); 13953 for (unsigned i = 0; i != NumElems/2; ++i) 13954 ShufMask2[i] = i + NumElems/2; 13955 13956 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]); 13957 13958 MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), 13959 VT.getVectorNumElements()/2); 13960 13961 OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo); 13962 OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi); 13963 13964 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); 13965 } 13966 13967 // Lower vector extended loads using a shuffle. If SSSE3 is not available we 13968 // may emit an illegal shuffle but the expansion is still better than scalar 13969 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise 13970 // we'll emit a shuffle and a arithmetic shift. 13971 // TODO: It is possible to support ZExt by zeroing the undef values during 13972 // the shuffle phase or after the shuffle. 13973 static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget, 13974 SelectionDAG &DAG) { 13975 MVT RegVT = Op.getSimpleValueType(); 13976 assert(RegVT.isVector() && "We only custom lower vector sext loads."); 13977 assert(RegVT.isInteger() && 13978 "We only custom lower integer vector sext loads."); 13979 13980 // Nothing useful we can do without SSE2 shuffles. 13981 assert(Subtarget->hasSSE2() && "We only custom lower sext loads with SSE2."); 13982 13983 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode()); 13984 SDLoc dl(Ld); 13985 EVT MemVT = Ld->getMemoryVT(); 13986 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 13987 unsigned RegSz = RegVT.getSizeInBits(); 13988 13989 ISD::LoadExtType Ext = Ld->getExtensionType(); 13990 13991 assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD) 13992 && "Only anyext and sext are currently implemented."); 13993 assert(MemVT != RegVT && "Cannot extend to the same type"); 13994 assert(MemVT.isVector() && "Must load a vector from memory"); 13995 13996 unsigned NumElems = RegVT.getVectorNumElements(); 13997 unsigned MemSz = MemVT.getSizeInBits(); 13998 assert(RegSz > MemSz && "Register size must be greater than the mem size"); 13999 14000 if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256()) { 14001 // The only way in which we have a legal 256-bit vector result but not the 14002 // integer 256-bit operations needed to directly lower a sextload is if we 14003 // have AVX1 but not AVX2. In that case, we can always emit a sextload to 14004 // a 128-bit vector and a normal sign_extend to 256-bits that should get 14005 // correctly legalized. We do this late to allow the canonical form of 14006 // sextload to persist throughout the rest of the DAG combiner -- it wants 14007 // to fold together any extensions it can, and so will fuse a sign_extend 14008 // of an sextload into a sextload targeting a wider value. 14009 SDValue Load; 14010 if (MemSz == 128) { 14011 // Just switch this to a normal load. 14012 assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, " 14013 "it must be a legal 128-bit vector " 14014 "type!"); 14015 Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(), 14016 Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(), 14017 Ld->isInvariant(), Ld->getAlignment()); 14018 } else { 14019 assert(MemSz < 128 && 14020 "Can't extend a type wider than 128 bits to a 256 bit vector!"); 14021 // Do an sext load to a 128-bit vector type. We want to use the same 14022 // number of elements, but elements half as wide. 
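The pre-AVX2 path above splits the input with two shuffles, sign-extends each half with VSEXT (vpmovsx), and concatenates the halves back to the full width. For the v8i16 to v8i32 case that amounts to the following, sketched with intrinsics (the helper name is illustrative):

#include <immintrin.h>

static __m256i sext_v8i16_to_v8i32(__m128i In) {
  __m128i Lo = _mm_cvtepi16_epi32(In);                     // vpmovsxwd, elements 0..3
  __m128i Hi = _mm_cvtepi16_epi32(_mm_srli_si128(In, 8));  // vpmovsxwd, elements 4..7
  return _mm256_insertf128_si256(_mm256_castsi128_si256(Lo), Hi, 1);
}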
This will end up being 14023 // recursively lowered by this routine, but will succeed as we definitely 14024 // have all the necessary features if we're using AVX1. 14025 EVT HalfEltVT = 14026 EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2); 14027 EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems); 14028 Load = 14029 DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(), 14030 Ld->getPointerInfo(), MemVT, Ld->isVolatile(), 14031 Ld->isNonTemporal(), Ld->isInvariant(), 14032 Ld->getAlignment()); 14033 } 14034 14035 // Replace chain users with the new chain. 14036 assert(Load->getNumValues() == 2 && "Loads must carry a chain!"); 14037 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1)); 14038 14039 // Finally, do a normal sign-extend to the desired register. 14040 return DAG.getSExtOrTrunc(Load, dl, RegVT); 14041 } 14042 14043 // All sizes must be a power of two. 14044 assert(isPowerOf2_32(RegSz * MemSz * NumElems) && 14045 "Non-power-of-two elements are not custom lowered!"); 14046 14047 // Attempt to load the original value using scalar loads. 14048 // Find the largest scalar type that divides the total loaded size. 14049 MVT SclrLoadTy = MVT::i8; 14050 for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE; 14051 tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) { 14052 MVT Tp = (MVT::SimpleValueType)tp; 14053 if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) { 14054 SclrLoadTy = Tp; 14055 } 14056 } 14057 14058 // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. 14059 if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 && 14060 (64 <= MemSz)) 14061 SclrLoadTy = MVT::f64; 14062 14063 // Calculate the number of scalar loads that we need to perform 14064 // in order to load our vector from memory. 14065 unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits(); 14066 14067 assert((Ext != ISD::SEXTLOAD || NumLoads == 1) && 14068 "Can only lower sext loads with a single scalar load!"); 14069 14070 unsigned loadRegZize = RegSz; 14071 if (Ext == ISD::SEXTLOAD && RegSz == 256) 14072 loadRegZize /= 2; 14073 14074 // Represent our vector as a sequence of elements which are the 14075 // largest scalar that we can load. 14076 EVT LoadUnitVecVT = EVT::getVectorVT( 14077 *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits()); 14078 14079 // Represent the data using the same element type that is stored in 14080 // memory. In practice, we ''widen'' MemVT. 14081 EVT WideVecVT = 14082 EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), 14083 loadRegZize / MemVT.getScalarType().getSizeInBits()); 14084 14085 assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() && 14086 "Invalid vector type"); 14087 14088 // We can't shuffle using an illegal type. 14089 assert(TLI.isTypeLegal(WideVecVT) && 14090 "We only lower types that form legal widened vector types"); 14091 14092 SmallVector<SDValue, 8> Chains; 14093 SDValue Ptr = Ld->getBasePtr(); 14094 SDValue Increment = 14095 DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, TLI.getPointerTy()); 14096 SDValue Res = DAG.getUNDEF(LoadUnitVecVT); 14097 14098 for (unsigned i = 0; i < NumLoads; ++i) { 14099 // Perform a single load. 
14100 SDValue ScalarLoad = 14101 DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(), 14102 Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(), 14103 Ld->getAlignment()); 14104 Chains.push_back(ScalarLoad.getValue(1)); 14105 // Create the first element type using SCALAR_TO_VECTOR in order to avoid 14106 // another round of DAGCombining. 14107 if (i == 0) 14108 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad); 14109 else 14110 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res, 14111 ScalarLoad, DAG.getIntPtrConstant(i)); 14112 14113 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); 14114 } 14115 14116 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); 14117 14118 // Bitcast the loaded value to a vector of the original element type, in 14119 // the size of the target vector type. 14120 SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res); 14121 unsigned SizeRatio = RegSz / MemSz; 14122 14123 if (Ext == ISD::SEXTLOAD) { 14124 // If we have SSE4.1, we can directly emit a VSEXT node. 14125 if (Subtarget->hasSSE41()) { 14126 SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec); 14127 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); 14128 return Sext; 14129 } 14130 14131 // Otherwise we'll shuffle the small elements in the high bits of the 14132 // larger type and perform an arithmetic shift. If the shift is not legal 14133 // it's better to scalarize. 14134 assert(TLI.isOperationLegalOrCustom(ISD::SRA, RegVT) && 14135 "We can't implement a sext load without an arithmetic right shift!"); 14136 14137 // Redistribute the loaded elements into the different locations. 14138 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); 14139 for (unsigned i = 0; i != NumElems; ++i) 14140 ShuffleVec[i * SizeRatio + SizeRatio - 1] = i; 14141 14142 SDValue Shuff = DAG.getVectorShuffle( 14143 WideVecVT, dl, SlicedVec, DAG.getUNDEF(WideVecVT), &ShuffleVec[0]); 14144 14145 Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff); 14146 14147 // Build the arithmetic shift. 14148 unsigned Amt = RegVT.getVectorElementType().getSizeInBits() - 14149 MemVT.getVectorElementType().getSizeInBits(); 14150 Shuff = 14151 DAG.getNode(ISD::SRA, dl, RegVT, Shuff, DAG.getConstant(Amt, RegVT)); 14152 14153 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); 14154 return Shuff; 14155 } 14156 14157 // Redistribute the loaded elements into the different locations. 14158 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); 14159 for (unsigned i = 0; i != NumElems; ++i) 14160 ShuffleVec[i * SizeRatio] = i; 14161 14162 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, 14163 DAG.getUNDEF(WideVecVT), &ShuffleVec[0]); 14164 14165 // Bitcast to the requested type. 14166 Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff); 14167 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); 14168 return Shuff; 14169 } 14170 14171 // isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or 14172 // ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart 14173 // from the AND / OR. 
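The pre-SSE4.1 fallback above shuffles each narrow element into the high bits of its wider lane and then uses an arithmetic right shift to pull it back down with the sign replicated. For extending four i16 elements to i32, the SSE2 version looks like this, as a sketch:

#include <emmintrin.h>

static __m128i sext_4xi16_to_4xi32_sse2(__m128i In) {     // i16 data in lanes 0..3
  // Interleave with zero so each i16 lands in bits [16, 31] of a 32-bit lane.
  __m128i High = _mm_unpacklo_epi16(_mm_setzero_si128(), In);
  return _mm_srai_epi32(High, 16);                         // arithmetic shift restores the sign
}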
14174 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { 14175 Opc = Op.getOpcode(); 14176 if (Opc != ISD::OR && Opc != ISD::AND) 14177 return false; 14178 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC && 14179 Op.getOperand(0).hasOneUse() && 14180 Op.getOperand(1).getOpcode() == X86ISD::SETCC && 14181 Op.getOperand(1).hasOneUse()); 14182 } 14183 14184 // isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and 14185 // 1 and that the SETCC node has a single use. 14186 static bool isXor1OfSetCC(SDValue Op) { 14187 if (Op.getOpcode() != ISD::XOR) 14188 return false; 14189 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 14190 if (N1C && N1C->getAPIntValue() == 1) { 14191 return Op.getOperand(0).getOpcode() == X86ISD::SETCC && 14192 Op.getOperand(0).hasOneUse(); 14193 } 14194 return false; 14195 } 14196 14197 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { 14198 bool addTest = true; 14199 SDValue Chain = Op.getOperand(0); 14200 SDValue Cond = Op.getOperand(1); 14201 SDValue Dest = Op.getOperand(2); 14202 SDLoc dl(Op); 14203 SDValue CC; 14204 bool Inverted = false; 14205 14206 if (Cond.getOpcode() == ISD::SETCC) { 14207 // Check for setcc([su]{add,sub,mul}o == 0). 14208 if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ && 14209 isa<ConstantSDNode>(Cond.getOperand(1)) && 14210 cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() && 14211 Cond.getOperand(0).getResNo() == 1 && 14212 (Cond.getOperand(0).getOpcode() == ISD::SADDO || 14213 Cond.getOperand(0).getOpcode() == ISD::UADDO || 14214 Cond.getOperand(0).getOpcode() == ISD::SSUBO || 14215 Cond.getOperand(0).getOpcode() == ISD::USUBO || 14216 Cond.getOperand(0).getOpcode() == ISD::SMULO || 14217 Cond.getOperand(0).getOpcode() == ISD::UMULO)) { 14218 Inverted = true; 14219 Cond = Cond.getOperand(0); 14220 } else { 14221 SDValue NewCond = LowerSETCC(Cond, DAG); 14222 if (NewCond.getNode()) 14223 Cond = NewCond; 14224 } 14225 } 14226 #if 0 14227 // FIXME: LowerXALUO doesn't handle these!! 14228 else if (Cond.getOpcode() == X86ISD::ADD || 14229 Cond.getOpcode() == X86ISD::SUB || 14230 Cond.getOpcode() == X86ISD::SMUL || 14231 Cond.getOpcode() == X86ISD::UMUL) 14232 Cond = LowerXALUO(Cond, DAG); 14233 #endif 14234 14235 // Look pass (and (setcc_carry (cmp ...)), 1). 14236 if (Cond.getOpcode() == ISD::AND && 14237 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 14238 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 14239 if (C && C->getAPIntValue() == 1) 14240 Cond = Cond.getOperand(0); 14241 } 14242 14243 // If condition flag is set by a X86ISD::CMP, then use it as the condition 14244 // setting operand in place of the X86ISD::SETCC. 14245 unsigned CondOpcode = Cond.getOpcode(); 14246 if (CondOpcode == X86ISD::SETCC || 14247 CondOpcode == X86ISD::SETCC_CARRY) { 14248 CC = Cond.getOperand(0); 14249 14250 SDValue Cmp = Cond.getOperand(1); 14251 unsigned Opc = Cmp.getOpcode(); 14252 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? 14253 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { 14254 Cond = Cmp; 14255 addTest = false; 14256 } else { 14257 switch (cast<ConstantSDNode>(CC)->getZExtValue()) { 14258 default: break; 14259 case X86::COND_O: 14260 case X86::COND_B: 14261 // These can only come from an arithmetic instruction with overflow, 14262 // e.g. SADDO, UADDO. 
14263 Cond = Cond.getNode()->getOperand(1); 14264 addTest = false; 14265 break; 14266 } 14267 } 14268 } 14269 CondOpcode = Cond.getOpcode(); 14270 if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || 14271 CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || 14272 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) && 14273 Cond.getOperand(0).getValueType() != MVT::i8)) { 14274 SDValue LHS = Cond.getOperand(0); 14275 SDValue RHS = Cond.getOperand(1); 14276 unsigned X86Opcode; 14277 unsigned X86Cond; 14278 SDVTList VTs; 14279 // Keep this in sync with LowerXALUO, otherwise we might create redundant 14280 // instructions that can't be removed afterwards (i.e. X86ISD::ADD and 14281 // X86ISD::INC). 14282 switch (CondOpcode) { 14283 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break; 14284 case ISD::SADDO: 14285 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) 14286 if (C->isOne()) { 14287 X86Opcode = X86ISD::INC; X86Cond = X86::COND_O; 14288 break; 14289 } 14290 X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; 14291 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break; 14292 case ISD::SSUBO: 14293 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) 14294 if (C->isOne()) { 14295 X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O; 14296 break; 14297 } 14298 X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break; 14299 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break; 14300 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break; 14301 default: llvm_unreachable("unexpected overflowing operator"); 14302 } 14303 if (Inverted) 14304 X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond); 14305 if (CondOpcode == ISD::UMULO) 14306 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(), 14307 MVT::i32); 14308 else 14309 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); 14310 14311 SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS); 14312 14313 if (CondOpcode == ISD::UMULO) 14314 Cond = X86Op.getValue(2); 14315 else 14316 Cond = X86Op.getValue(1); 14317 14318 CC = DAG.getConstant(X86Cond, MVT::i8); 14319 addTest = false; 14320 } else { 14321 unsigned CondOpc; 14322 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { 14323 SDValue Cmp = Cond.getOperand(0).getOperand(1); 14324 if (CondOpc == ISD::OR) { 14325 // Also, recognize the pattern generated by an FCMP_UNE. We can emit 14326 // two branches instead of an explicit OR instruction with a 14327 // separate test. 14328 if (Cmp == Cond.getOperand(1).getOperand(1) && 14329 isX86LogicalCmp(Cmp)) { 14330 CC = Cond.getOperand(0).getOperand(0); 14331 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 14332 Chain, Dest, CC, Cmp); 14333 CC = Cond.getOperand(1).getOperand(0); 14334 Cond = Cmp; 14335 addTest = false; 14336 } 14337 } else { // ISD::AND 14338 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit 14339 // two branches instead of an explicit AND instruction with a 14340 // separate test. However, we only do this if this block doesn't 14341 // have a fall-through edge, because this requires an explicit 14342 // jmp when the condition is false. 
14343 if (Cmp == Cond.getOperand(1).getOperand(1) && 14344 isX86LogicalCmp(Cmp) && 14345 Op.getNode()->hasOneUse()) { 14346 X86::CondCode CCode = 14347 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 14348 CCode = X86::GetOppositeBranchCondition(CCode); 14349 CC = DAG.getConstant(CCode, MVT::i8); 14350 SDNode *User = *Op.getNode()->use_begin(); 14351 // Look for an unconditional branch following this conditional branch. 14352 // We need this because we need to reverse the successors in order 14353 // to implement FCMP_OEQ. 14354 if (User->getOpcode() == ISD::BR) { 14355 SDValue FalseBB = User->getOperand(1); 14356 SDNode *NewBR = 14357 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); 14358 assert(NewBR == User); 14359 (void)NewBR; 14360 Dest = FalseBB; 14361 14362 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 14363 Chain, Dest, CC, Cmp); 14364 X86::CondCode CCode = 14365 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); 14366 CCode = X86::GetOppositeBranchCondition(CCode); 14367 CC = DAG.getConstant(CCode, MVT::i8); 14368 Cond = Cmp; 14369 addTest = false; 14370 } 14371 } 14372 } 14373 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) { 14374 // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition. 14375 // It should be transformed during dag combiner except when the condition 14376 // is set by a arithmetics with overflow node. 14377 X86::CondCode CCode = 14378 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 14379 CCode = X86::GetOppositeBranchCondition(CCode); 14380 CC = DAG.getConstant(CCode, MVT::i8); 14381 Cond = Cond.getOperand(0).getOperand(1); 14382 addTest = false; 14383 } else if (Cond.getOpcode() == ISD::SETCC && 14384 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) { 14385 // For FCMP_OEQ, we can emit 14386 // two branches instead of an explicit AND instruction with a 14387 // separate test. However, we only do this if this block doesn't 14388 // have a fall-through edge, because this requires an explicit 14389 // jmp when the condition is false. 14390 if (Op.getNode()->hasOneUse()) { 14391 SDNode *User = *Op.getNode()->use_begin(); 14392 // Look for an unconditional branch following this conditional branch. 14393 // We need this because we need to reverse the successors in order 14394 // to implement FCMP_OEQ. 14395 if (User->getOpcode() == ISD::BR) { 14396 SDValue FalseBB = User->getOperand(1); 14397 SDNode *NewBR = 14398 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); 14399 assert(NewBR == User); 14400 (void)NewBR; 14401 Dest = FalseBB; 14402 14403 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 14404 Cond.getOperand(0), Cond.getOperand(1)); 14405 Cmp = ConvertCmpIfNecessary(Cmp, DAG); 14406 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 14407 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 14408 Chain, Dest, CC, Cmp); 14409 CC = DAG.getConstant(X86::COND_P, MVT::i8); 14410 Cond = Cmp; 14411 addTest = false; 14412 } 14413 } 14414 } else if (Cond.getOpcode() == ISD::SETCC && 14415 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) { 14416 // For FCMP_UNE, we can emit 14417 // two branches instead of an explicit AND instruction with a 14418 // separate test. However, we only do this if this block doesn't 14419 // have a fall-through edge, because this requires an explicit 14420 // jmp when the condition is false. 
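For reference, the overflow-operator switch earlier in LowerBRCOND pairs each ISD node with an X86 flag: unsigned add/sub branch on the carry flag (X86::COND_B), while signed add/sub (or INC/DEC for a constant 1) and both multiplies branch on the overflow flag (X86::COND_O). A scalar sketch of the two detections in plain C++ rather than DAG nodes, assuming a 32-bit int (helper names are illustrative):

#include <climits>

bool uaddOverflows(unsigned a, unsigned b) {   // ISD::UADDO -> X86ISD::ADD + X86::COND_B
  return a + b < a;                            // true exactly when the add carries out
}

bool saddOverflows(int a, int b) {             // ISD::SADDO -> X86ISD::ADD + X86::COND_O
  long long wide = (long long)a + b;
  return wide > INT_MAX || wide < INT_MIN;     // result does not fit the 32-bit destination
}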
14421 if (Op.getNode()->hasOneUse()) { 14422 SDNode *User = *Op.getNode()->use_begin(); 14423 // Look for an unconditional branch following this conditional branch. 14424 // We need this because we need to reverse the successors in order 14425 // to implement FCMP_UNE. 14426 if (User->getOpcode() == ISD::BR) { 14427 SDValue FalseBB = User->getOperand(1); 14428 SDNode *NewBR = 14429 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); 14430 assert(NewBR == User); 14431 (void)NewBR; 14432 14433 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 14434 Cond.getOperand(0), Cond.getOperand(1)); 14435 Cmp = ConvertCmpIfNecessary(Cmp, DAG); 14436 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 14437 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 14438 Chain, Dest, CC, Cmp); 14439 CC = DAG.getConstant(X86::COND_NP, MVT::i8); 14440 Cond = Cmp; 14441 addTest = false; 14442 Dest = FalseBB; 14443 } 14444 } 14445 } 14446 } 14447 14448 if (addTest) { 14449 // Look pass the truncate if the high bits are known zero. 14450 if (isTruncWithZeroHighBitsInput(Cond, DAG)) 14451 Cond = Cond.getOperand(0); 14452 14453 // We know the result of AND is compared against zero. Try to match 14454 // it to BT. 14455 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 14456 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG); 14457 if (NewSetCC.getNode()) { 14458 CC = NewSetCC.getOperand(0); 14459 Cond = NewSetCC.getOperand(1); 14460 addTest = false; 14461 } 14462 } 14463 } 14464 14465 if (addTest) { 14466 X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE; 14467 CC = DAG.getConstant(X86Cond, MVT::i8); 14468 Cond = EmitTest(Cond, X86Cond, dl, DAG); 14469 } 14470 Cond = ConvertCmpIfNecessary(Cond, DAG); 14471 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 14472 Chain, Dest, CC, Cond); 14473 } 14474 14475 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets. 14476 // Calls to _alloca are needed to probe the stack when allocating more than 4k 14477 // bytes in one go. Touching the stack at 4K increments is necessary to ensure 14478 // that the guard pages used by the OS virtual memory manager are allocated in 14479 // correct sequence. 14480 SDValue 14481 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 14482 SelectionDAG &DAG) const { 14483 MachineFunction &MF = DAG.getMachineFunction(); 14484 bool SplitStack = MF.shouldSplitStack(); 14485 bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMacho()) || 14486 SplitStack; 14487 SDLoc dl(Op); 14488 14489 if (!Lower) { 14490 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 14491 SDNode* Node = Op.getNode(); 14492 14493 unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore(); 14494 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and" 14495 " not tell us which reg is the stack pointer!"); 14496 EVT VT = Node->getValueType(0); 14497 SDValue Tmp1 = SDValue(Node, 0); 14498 SDValue Tmp2 = SDValue(Node, 1); 14499 SDValue Tmp3 = Node->getOperand(2); 14500 SDValue Chain = Tmp1.getOperand(0); 14501 14502 // Chain the dynamic stack allocation so that it doesn't modify the stack 14503 // pointer when other instructions are using the stack. 
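When no probing or segmented-stack support is needed, the expansion below is just pointer arithmetic on the stack pointer: subtract the size, then realign only if the request is over-aligned. A sketch with plain integers standing in for the ISD::SUB and ISD::AND nodes (align is assumed to be a power of two, as it always is here; the helper name is illustrative):

#include <cstdint>

uintptr_t expandDynamicAlloca(uintptr_t sp, uint64_t size,
                              uint64_t align, uint64_t stackAlign) {
  uintptr_t p = sp - size;                     // ISD::SUB: carve the block off the stack
  if (align > stackAlign)                      // only over-aligned requests need the AND
    p &= -(uintptr_t)align;                    // same mask as DAG.getConstant(-(uint64_t)Align, VT)
  return p;                                    // becomes both the new SP and the returned pointer
}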
14504 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true), 14505 SDLoc(Node)); 14506 14507 SDValue Size = Tmp2.getOperand(1); 14508 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); 14509 Chain = SP.getValue(1); 14510 unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue(); 14511 const TargetFrameLowering &TFI = *DAG.getSubtarget().getFrameLowering(); 14512 unsigned StackAlign = TFI.getStackAlignment(); 14513 Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value 14514 if (Align > StackAlign) 14515 Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1, 14516 DAG.getConstant(-(uint64_t)Align, VT)); 14517 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain 14518 14519 Tmp2 = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, true), 14520 DAG.getIntPtrConstant(0, true), SDValue(), 14521 SDLoc(Node)); 14522 14523 SDValue Ops[2] = { Tmp1, Tmp2 }; 14524 return DAG.getMergeValues(Ops, dl); 14525 } 14526 14527 // Get the inputs. 14528 SDValue Chain = Op.getOperand(0); 14529 SDValue Size = Op.getOperand(1); 14530 unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 14531 EVT VT = Op.getNode()->getValueType(0); 14532 14533 bool Is64Bit = Subtarget->is64Bit(); 14534 EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32; 14535 14536 if (SplitStack) { 14537 MachineRegisterInfo &MRI = MF.getRegInfo(); 14538 14539 if (Is64Bit) { 14540 // The 64-bit implementation of segmented stacks needs to clobber both r10 14541 // and r11. This makes it impossible to use it along with nested parameters. 14542 const Function *F = MF.getFunction(); 14543 14544 for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); 14545 I != E; ++I) 14546 if (I->hasNestAttr()) 14547 report_fatal_error("Cannot use segmented stacks with functions that " 14548 "have nested arguments."); 14549 } 14550 14551 const TargetRegisterClass *AddrRegClass = 14552 getRegClassFor(Subtarget->is64Bit() ? MVT::i64:MVT::i32); 14553 unsigned Vreg = MRI.createVirtualRegister(AddrRegClass); 14554 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size); 14555 SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, 14556 DAG.getRegister(Vreg, SPTy)); 14557 SDValue Ops1[2] = { Value, Chain }; 14558 return DAG.getMergeValues(Ops1, dl); 14559 } else { 14560 SDValue Flag; 14561 unsigned Reg = (Subtarget->is64Bit() ?
X86::RAX : X86::EAX); 14562 14563 Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag); 14564 Flag = Chain.getValue(1); 14565 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 14566 14567 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag); 14568 14569 const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( 14570 DAG.getSubtarget().getRegisterInfo()); 14571 unsigned SPReg = RegInfo->getStackRegister(); 14572 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy); 14573 Chain = SP.getValue(1); 14574 14575 if (Align) { 14576 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), 14577 DAG.getConstant(-(uint64_t)Align, VT)); 14578 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP); 14579 } 14580 14581 SDValue Ops1[2] = { SP, Chain }; 14582 return DAG.getMergeValues(Ops1, dl); 14583 } 14584 } 14585 14586 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { 14587 MachineFunction &MF = DAG.getMachineFunction(); 14588 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 14589 14590 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 14591 SDLoc DL(Op); 14592 14593 if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) { 14594 // vastart just stores the address of the VarArgsFrameIndex slot into the 14595 // memory location argument. 14596 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 14597 getPointerTy()); 14598 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), 14599 MachinePointerInfo(SV), false, false, 0); 14600 } 14601 14602 // __va_list_tag: 14603 // gp_offset (0 - 6 * 8) 14604 // fp_offset (48 - 48 + 8 * 16) 14605 // overflow_arg_area (point to parameters coming in memory). 14606 // reg_save_area 14607 SmallVector<SDValue, 8> MemOps; 14608 SDValue FIN = Op.getOperand(1); 14609 // Store gp_offset 14610 SDValue Store = DAG.getStore(Op.getOperand(0), DL, 14611 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), 14612 MVT::i32), 14613 FIN, MachinePointerInfo(SV), false, false, 0); 14614 MemOps.push_back(Store); 14615 14616 // Store fp_offset 14617 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 14618 FIN, DAG.getIntPtrConstant(4)); 14619 Store = DAG.getStore(Op.getOperand(0), DL, 14620 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), 14621 MVT::i32), 14622 FIN, MachinePointerInfo(SV, 4), false, false, 0); 14623 MemOps.push_back(Store); 14624 14625 // Store ptr to overflow_arg_area 14626 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 14627 FIN, DAG.getIntPtrConstant(4)); 14628 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 14629 getPointerTy()); 14630 Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, 14631 MachinePointerInfo(SV, 8), 14632 false, false, 0); 14633 MemOps.push_back(Store); 14634 14635 // Store ptr to reg_save_area. 
14636 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 14637 FIN, DAG.getIntPtrConstant(8)); 14638 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 14639 getPointerTy()); 14640 Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, 14641 MachinePointerInfo(SV, 16), false, false, 0); 14642 MemOps.push_back(Store); 14643 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); 14644 } 14645 14646 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 14647 assert(Subtarget->is64Bit() && 14648 "LowerVAARG only handles 64-bit va_arg!"); 14649 assert((Subtarget->isTargetLinux() || 14650 Subtarget->isTargetDarwin()) && 14651 "Unhandled target in LowerVAARG"); 14652 assert(Op.getNode()->getNumOperands() == 4); 14653 SDValue Chain = Op.getOperand(0); 14654 SDValue SrcPtr = Op.getOperand(1); 14655 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 14656 unsigned Align = Op.getConstantOperandVal(3); 14657 SDLoc dl(Op); 14658 14659 EVT ArgVT = Op.getNode()->getValueType(0); 14660 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 14661 uint32_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy); 14662 uint8_t ArgMode; 14663 14664 // Decide which area this value should be read from. 14665 // TODO: Implement the AMD64 ABI in its entirety. This simple 14666 // selection mechanism works only for the basic types. 14667 if (ArgVT == MVT::f80) { 14668 llvm_unreachable("va_arg for f80 not yet implemented"); 14669 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) { 14670 ArgMode = 2; // Argument passed in XMM register. Use fp_offset. 14671 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) { 14672 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset. 14673 } else { 14674 llvm_unreachable("Unhandled argument type in LowerVAARG"); 14675 } 14676 14677 if (ArgMode == 2) { 14678 // Sanity Check: Make sure using fp_offset makes sense. 14679 assert(!DAG.getTarget().Options.UseSoftFloat && 14680 !(DAG.getMachineFunction() 14681 .getFunction()->getAttributes() 14682 .hasAttribute(AttributeSet::FunctionIndex, 14683 Attribute::NoImplicitFloat)) && 14684 Subtarget->hasSSE1()); 14685 } 14686 14687 // Insert VAARG_64 node into the DAG 14688 // VAARG_64 returns two values: Variable Argument Address, Chain 14689 SmallVector<SDValue, 11> InstOps; 14690 InstOps.push_back(Chain); 14691 InstOps.push_back(SrcPtr); 14692 InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32)); 14693 InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8)); 14694 InstOps.push_back(DAG.getConstant(Align, MVT::i32)); 14695 SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other); 14696 SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl, 14697 VTs, InstOps, MVT::i64, 14698 MachinePointerInfo(SV), 14699 /*Align=*/0, 14700 /*Volatile=*/false, 14701 /*ReadMem=*/true, 14702 /*WriteMem=*/true); 14703 Chain = VAARG.getValue(1); 14704 14705 // Load the next argument and return it 14706 return DAG.getLoad(ArgVT, dl, 14707 Chain, 14708 VAARG, 14709 MachinePointerInfo(), 14710 false, false, false, 0); 14711 } 14712 14713 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget, 14714 SelectionDAG &DAG) { 14715 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 
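The four stores emitted above fill in the SysV x86-64 __va_list_tag at offsets 0, 4, 8 and 16. The sketch below spells out that layout as a plain struct (assuming an LP64 target; the struct name is only for the example), which is also why LowerVACOPY just below can copy a va_list with a fixed 24-byte memcpy.

struct VaListTag {                             // matches "struct { i32, i32, i8*, i8* }"
  unsigned gp_offset;                          // next GP register slot in reg_save_area (0..48)
  unsigned fp_offset;                          // next XMM register slot in reg_save_area (48..176)
  void *overflow_arg_area;                     // next stack-passed argument
  void *reg_save_area;                         // base of the spilled argument registers
};
static_assert(sizeof(VaListTag) == 24, "va_copy is a 24-byte memcpy");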
14716 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); 14717 SDValue Chain = Op.getOperand(0); 14718 SDValue DstPtr = Op.getOperand(1); 14719 SDValue SrcPtr = Op.getOperand(2); 14720 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 14721 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 14722 SDLoc DL(Op); 14723 14724 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, 14725 DAG.getIntPtrConstant(24), 8, /*isVolatile*/false, 14726 false, 14727 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); 14728 } 14729 14730 // getTargetVShiftByConstNode - Handle vector element shifts where the shift 14731 // amount is a constant. Takes immediate version of shift as input. 14732 static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT, 14733 SDValue SrcOp, uint64_t ShiftAmt, 14734 SelectionDAG &DAG) { 14735 MVT ElementType = VT.getVectorElementType(); 14736 14737 // Fold this packed shift into its first operand if ShiftAmt is 0. 14738 if (ShiftAmt == 0) 14739 return SrcOp; 14740 14741 // Check for ShiftAmt >= element width 14742 if (ShiftAmt >= ElementType.getSizeInBits()) { 14743 if (Opc == X86ISD::VSRAI) 14744 ShiftAmt = ElementType.getSizeInBits() - 1; 14745 else 14746 return DAG.getConstant(0, VT); 14747 } 14748 14749 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI) 14750 && "Unknown target vector shift-by-constant node"); 14751 14752 // Fold this packed vector shift into a build vector if SrcOp is a 14753 // vector of Constants or UNDEFs, and SrcOp valuetype is the same as VT. 14754 if (VT == SrcOp.getSimpleValueType() && 14755 ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) { 14756 SmallVector<SDValue, 8> Elts; 14757 unsigned NumElts = SrcOp->getNumOperands(); 14758 ConstantSDNode *ND; 14759 14760 switch(Opc) { 14761 default: llvm_unreachable(nullptr); 14762 case X86ISD::VSHLI: 14763 for (unsigned i=0; i!=NumElts; ++i) { 14764 SDValue CurrentOp = SrcOp->getOperand(i); 14765 if (CurrentOp->getOpcode() == ISD::UNDEF) { 14766 Elts.push_back(CurrentOp); 14767 continue; 14768 } 14769 ND = cast<ConstantSDNode>(CurrentOp); 14770 const APInt &C = ND->getAPIntValue(); 14771 Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), ElementType)); 14772 } 14773 break; 14774 case X86ISD::VSRLI: 14775 for (unsigned i=0; i!=NumElts; ++i) { 14776 SDValue CurrentOp = SrcOp->getOperand(i); 14777 if (CurrentOp->getOpcode() == ISD::UNDEF) { 14778 Elts.push_back(CurrentOp); 14779 continue; 14780 } 14781 ND = cast<ConstantSDNode>(CurrentOp); 14782 const APInt &C = ND->getAPIntValue(); 14783 Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), ElementType)); 14784 } 14785 break; 14786 case X86ISD::VSRAI: 14787 for (unsigned i=0; i!=NumElts; ++i) { 14788 SDValue CurrentOp = SrcOp->getOperand(i); 14789 if (CurrentOp->getOpcode() == ISD::UNDEF) { 14790 Elts.push_back(CurrentOp); 14791 continue; 14792 } 14793 ND = cast<ConstantSDNode>(CurrentOp); 14794 const APInt &C = ND->getAPIntValue(); 14795 Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), ElementType)); 14796 } 14797 break; 14798 } 14799 14800 return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts); 14801 } 14802 14803 return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, MVT::i8)); 14804 } 14805 14806 // getTargetVShiftNode - Handle vector element shifts where the shift amount 14807 // may or may not be a constant. Takes immediate version of shift as input. 
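Before the non-constant case, a scalar model of the constant-folding rules getTargetVShiftByConstNode applies above (32-bit lanes are assumed purely for illustration): a zero amount returns the source unchanged, logical shifts by the element width or more fold to zero, and arithmetic shifts clamp the amount to width - 1.

#include <cstdint>

uint32_t foldVSHLI(uint32_t lane, uint64_t amt) { return amt >= 32 ? 0 : lane << amt; }
uint32_t foldVSRLI(uint32_t lane, uint64_t amt) { return amt >= 32 ? 0 : lane >> amt; }
int32_t  foldVSRAI(int32_t  lane, uint64_t amt) {
  return lane >> (amt >= 32 ? 31 : amt);       // sign-filling shift, amount saturates at 31
}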
14808 static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT, 14809 SDValue SrcOp, SDValue ShAmt, 14810 SelectionDAG &DAG) { 14811 assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32"); 14812 14813 // Catch shift-by-constant. 14814 if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt)) 14815 return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp, 14816 CShAmt->getZExtValue(), DAG); 14817 14818 // Change opcode to non-immediate version 14819 switch (Opc) { 14820 default: llvm_unreachable("Unknown target vector shift node"); 14821 case X86ISD::VSHLI: Opc = X86ISD::VSHL; break; 14822 case X86ISD::VSRLI: Opc = X86ISD::VSRL; break; 14823 case X86ISD::VSRAI: Opc = X86ISD::VSRA; break; 14824 } 14825 14826 // Need to build a vector containing shift amount 14827 // Shift amount is 32-bits, but SSE instructions read 64-bit, so fill with 0 14828 SDValue ShOps[4]; 14829 ShOps[0] = ShAmt; 14830 ShOps[1] = DAG.getConstant(0, MVT::i32); 14831 ShOps[2] = ShOps[3] = DAG.getUNDEF(MVT::i32); 14832 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, ShOps); 14833 14834 // The return type has to be a 128-bit type with the same element 14835 // type as the input type. 14836 MVT EltVT = VT.getVectorElementType(); 14837 EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits()); 14838 14839 ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt); 14840 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); 14841 } 14842 14843 /// \brief Return (vselect \p Mask, \p Op, \p PreservedSrc) along with the 14844 /// necessary casting for \p Mask when lowering masking intrinsics. 14845 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, 14846 SDValue PreservedSrc, SelectionDAG &DAG) { 14847 EVT VT = Op.getValueType(); 14848 EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), 14849 MVT::i1, VT.getVectorNumElements()); 14850 SDLoc dl(Op); 14851 14852 assert(MaskVT.isSimple() && "invalid mask type"); 14853 return DAG.getNode(ISD::VSELECT, dl, VT, 14854 DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask), 14855 Op, PreservedSrc); 14856 } 14857 14858 static unsigned getOpcodeForFMAIntrinsic(unsigned IntNo) { 14859 switch (IntNo) { 14860 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
14861 case Intrinsic::x86_fma_vfmadd_ps: 14862 case Intrinsic::x86_fma_vfmadd_pd: 14863 case Intrinsic::x86_fma_vfmadd_ps_256: 14864 case Intrinsic::x86_fma_vfmadd_pd_256: 14865 case Intrinsic::x86_fma_mask_vfmadd_ps_512: 14866 case Intrinsic::x86_fma_mask_vfmadd_pd_512: 14867 return X86ISD::FMADD; 14868 case Intrinsic::x86_fma_vfmsub_ps: 14869 case Intrinsic::x86_fma_vfmsub_pd: 14870 case Intrinsic::x86_fma_vfmsub_ps_256: 14871 case Intrinsic::x86_fma_vfmsub_pd_256: 14872 case Intrinsic::x86_fma_mask_vfmsub_ps_512: 14873 case Intrinsic::x86_fma_mask_vfmsub_pd_512: 14874 return X86ISD::FMSUB; 14875 case Intrinsic::x86_fma_vfnmadd_ps: 14876 case Intrinsic::x86_fma_vfnmadd_pd: 14877 case Intrinsic::x86_fma_vfnmadd_ps_256: 14878 case Intrinsic::x86_fma_vfnmadd_pd_256: 14879 case Intrinsic::x86_fma_mask_vfnmadd_ps_512: 14880 case Intrinsic::x86_fma_mask_vfnmadd_pd_512: 14881 return X86ISD::FNMADD; 14882 case Intrinsic::x86_fma_vfnmsub_ps: 14883 case Intrinsic::x86_fma_vfnmsub_pd: 14884 case Intrinsic::x86_fma_vfnmsub_ps_256: 14885 case Intrinsic::x86_fma_vfnmsub_pd_256: 14886 case Intrinsic::x86_fma_mask_vfnmsub_ps_512: 14887 case Intrinsic::x86_fma_mask_vfnmsub_pd_512: 14888 return X86ISD::FNMSUB; 14889 case Intrinsic::x86_fma_vfmaddsub_ps: 14890 case Intrinsic::x86_fma_vfmaddsub_pd: 14891 case Intrinsic::x86_fma_vfmaddsub_ps_256: 14892 case Intrinsic::x86_fma_vfmaddsub_pd_256: 14893 case Intrinsic::x86_fma_mask_vfmaddsub_ps_512: 14894 case Intrinsic::x86_fma_mask_vfmaddsub_pd_512: 14895 return X86ISD::FMADDSUB; 14896 case Intrinsic::x86_fma_vfmsubadd_ps: 14897 case Intrinsic::x86_fma_vfmsubadd_pd: 14898 case Intrinsic::x86_fma_vfmsubadd_ps_256: 14899 case Intrinsic::x86_fma_vfmsubadd_pd_256: 14900 case Intrinsic::x86_fma_mask_vfmsubadd_ps_512: 14901 case Intrinsic::x86_fma_mask_vfmsubadd_pd_512: 14902 return X86ISD::FMSUBADD; 14903 } 14904 } 14905 14906 static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { 14907 SDLoc dl(Op); 14908 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 14909 14910 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo); 14911 if (IntrData) { 14912 switch(IntrData->Type) { 14913 case INTR_TYPE_1OP: 14914 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1)); 14915 case INTR_TYPE_2OP: 14916 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), 14917 Op.getOperand(2)); 14918 case INTR_TYPE_3OP: 14919 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), 14920 Op.getOperand(2), Op.getOperand(3)); 14921 case COMI: { // Comparison intrinsics 14922 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1; 14923 SDValue LHS = Op.getOperand(1); 14924 SDValue RHS = Op.getOperand(2); 14925 unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG); 14926 assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!"); 14927 SDValue Cond = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS); 14928 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 14929 DAG.getConstant(X86CC, MVT::i8), Cond); 14930 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 14931 } 14932 case VSHIFT: 14933 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(), 14934 Op.getOperand(1), Op.getOperand(2), DAG); 14935 default: 14936 break; 14937 } 14938 } 14939 14940 switch (IntNo) { 14941 default: return SDValue(); // Don't custom lower most intrinsics. 14942 14943 // Arithmetic intrinsics. 
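A scalar view of the first arithmetic case below: X86ISD::PMULUDQ (and its signed SSE4.1 counterpart PMULDQ) multiplies the even-numbered 32-bit lanes of its sources into full 64-bit products. The two helpers are only a lane-level illustration:

#include <cstdint>

uint64_t pmuludqLane(uint32_t a, uint32_t b) { return (uint64_t)a * b; } // unsigned widening multiply
int64_t  pmuldqLane (int32_t  a, int32_t  b) { return (int64_t)a * b; }  // signed widening multiply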
14944 case Intrinsic::x86_sse2_pmulu_dq: 14945 case Intrinsic::x86_avx2_pmulu_dq: 14946 return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(), 14947 Op.getOperand(1), Op.getOperand(2)); 14948 14949 case Intrinsic::x86_sse41_pmuldq: 14950 case Intrinsic::x86_avx2_pmul_dq: 14951 return DAG.getNode(X86ISD::PMULDQ, dl, Op.getValueType(), 14952 Op.getOperand(1), Op.getOperand(2)); 14953 14954 case Intrinsic::x86_sse2_pmulhu_w: 14955 case Intrinsic::x86_avx2_pmulhu_w: 14956 return DAG.getNode(ISD::MULHU, dl, Op.getValueType(), 14957 Op.getOperand(1), Op.getOperand(2)); 14958 14959 case Intrinsic::x86_sse2_pmulh_w: 14960 case Intrinsic::x86_avx2_pmulh_w: 14961 return DAG.getNode(ISD::MULHS, dl, Op.getValueType(), 14962 Op.getOperand(1), Op.getOperand(2)); 14963 14964 // SSE/SSE2/AVX floating point max/min intrinsics. 14965 case Intrinsic::x86_sse_max_ps: 14966 case Intrinsic::x86_sse2_max_pd: 14967 case Intrinsic::x86_avx_max_ps_256: 14968 case Intrinsic::x86_avx_max_pd_256: 14969 case Intrinsic::x86_sse_min_ps: 14970 case Intrinsic::x86_sse2_min_pd: 14971 case Intrinsic::x86_avx_min_ps_256: 14972 case Intrinsic::x86_avx_min_pd_256: { 14973 unsigned Opcode; 14974 switch (IntNo) { 14975 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 14976 case Intrinsic::x86_sse_max_ps: 14977 case Intrinsic::x86_sse2_max_pd: 14978 case Intrinsic::x86_avx_max_ps_256: 14979 case Intrinsic::x86_avx_max_pd_256: 14980 Opcode = X86ISD::FMAX; 14981 break; 14982 case Intrinsic::x86_sse_min_ps: 14983 case Intrinsic::x86_sse2_min_pd: 14984 case Intrinsic::x86_avx_min_ps_256: 14985 case Intrinsic::x86_avx_min_pd_256: 14986 Opcode = X86ISD::FMIN; 14987 break; 14988 } 14989 return DAG.getNode(Opcode, dl, Op.getValueType(), 14990 Op.getOperand(1), Op.getOperand(2)); 14991 } 14992 14993 // AVX2 variable shift intrinsics 14994 case Intrinsic::x86_avx2_psllv_d: 14995 case Intrinsic::x86_avx2_psllv_q: 14996 case Intrinsic::x86_avx2_psllv_d_256: 14997 case Intrinsic::x86_avx2_psllv_q_256: 14998 case Intrinsic::x86_avx2_psrlv_d: 14999 case Intrinsic::x86_avx2_psrlv_q: 15000 case Intrinsic::x86_avx2_psrlv_d_256: 15001 case Intrinsic::x86_avx2_psrlv_q_256: 15002 case Intrinsic::x86_avx2_psrav_d: 15003 case Intrinsic::x86_avx2_psrav_d_256: { 15004 unsigned Opcode; 15005 switch (IntNo) { 15006 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
15007 case Intrinsic::x86_avx2_psllv_d: 15008 case Intrinsic::x86_avx2_psllv_q: 15009 case Intrinsic::x86_avx2_psllv_d_256: 15010 case Intrinsic::x86_avx2_psllv_q_256: 15011 Opcode = ISD::SHL; 15012 break; 15013 case Intrinsic::x86_avx2_psrlv_d: 15014 case Intrinsic::x86_avx2_psrlv_q: 15015 case Intrinsic::x86_avx2_psrlv_d_256: 15016 case Intrinsic::x86_avx2_psrlv_q_256: 15017 Opcode = ISD::SRL; 15018 break; 15019 case Intrinsic::x86_avx2_psrav_d: 15020 case Intrinsic::x86_avx2_psrav_d_256: 15021 Opcode = ISD::SRA; 15022 break; 15023 } 15024 return DAG.getNode(Opcode, dl, Op.getValueType(), 15025 Op.getOperand(1), Op.getOperand(2)); 15026 } 15027 15028 case Intrinsic::x86_sse2_packssdw_128: 15029 case Intrinsic::x86_sse2_packsswb_128: 15030 case Intrinsic::x86_avx2_packssdw: 15031 case Intrinsic::x86_avx2_packsswb: 15032 return DAG.getNode(X86ISD::PACKSS, dl, Op.getValueType(), 15033 Op.getOperand(1), Op.getOperand(2)); 15034 15035 case Intrinsic::x86_sse2_packuswb_128: 15036 case Intrinsic::x86_sse41_packusdw: 15037 case Intrinsic::x86_avx2_packuswb: 15038 case Intrinsic::x86_avx2_packusdw: 15039 return DAG.getNode(X86ISD::PACKUS, dl, Op.getValueType(), 15040 Op.getOperand(1), Op.getOperand(2)); 15041 15042 case Intrinsic::x86_ssse3_pshuf_b_128: 15043 case Intrinsic::x86_avx2_pshuf_b: 15044 return DAG.getNode(X86ISD::PSHUFB, dl, Op.getValueType(), 15045 Op.getOperand(1), Op.getOperand(2)); 15046 15047 case Intrinsic::x86_sse2_pshuf_d: 15048 return DAG.getNode(X86ISD::PSHUFD, dl, Op.getValueType(), 15049 Op.getOperand(1), Op.getOperand(2)); 15050 15051 case Intrinsic::x86_sse2_pshufl_w: 15052 return DAG.getNode(X86ISD::PSHUFLW, dl, Op.getValueType(), 15053 Op.getOperand(1), Op.getOperand(2)); 15054 15055 case Intrinsic::x86_sse2_pshufh_w: 15056 return DAG.getNode(X86ISD::PSHUFHW, dl, Op.getValueType(), 15057 Op.getOperand(1), Op.getOperand(2)); 15058 15059 case Intrinsic::x86_ssse3_psign_b_128: 15060 case Intrinsic::x86_ssse3_psign_w_128: 15061 case Intrinsic::x86_ssse3_psign_d_128: 15062 case Intrinsic::x86_avx2_psign_b: 15063 case Intrinsic::x86_avx2_psign_w: 15064 case Intrinsic::x86_avx2_psign_d: 15065 return DAG.getNode(X86ISD::PSIGN, dl, Op.getValueType(), 15066 Op.getOperand(1), Op.getOperand(2)); 15067 15068 case Intrinsic::x86_avx2_permd: 15069 case Intrinsic::x86_avx2_permps: 15070 // Operands intentionally swapped. Mask is last operand to intrinsic, 15071 // but second operand for node/instruction. 15072 return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(), 15073 Op.getOperand(2), Op.getOperand(1)); 15074 15075 case Intrinsic::x86_avx512_mask_valign_q_512: 15076 case Intrinsic::x86_avx512_mask_valign_d_512: 15077 // Vector source operands are swapped. 15078 return getVectorMaskingNode(DAG.getNode(X86ISD::VALIGN, dl, 15079 Op.getValueType(), Op.getOperand(2), 15080 Op.getOperand(1), 15081 Op.getOperand(3)), 15082 Op.getOperand(5), Op.getOperand(4), DAG); 15083 15084 // ptest and testp intrinsics. The intrinsic these come from are designed to 15085 // return an integer value, not just an instruction so lower it to the ptest 15086 // or testp pattern and a setcc for the result. 
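As the comment above says, these intrinsics return an integer, so the lowering pairs the flag-producing PTEST/TESTP node with a SETCC and a zero-extend. A usage-level sketch of the "testz" flavour with the corresponding SSE4.1 intrinsic (the wrapper name is illustrative):

#include <smmintrin.h>

int allSelectedBitsZero(__m128i value, __m128i mask) {
  // ptest sets ZF when (value & mask) == 0; this is the X86::COND_E path below.
  return _mm_testz_si128(value, mask);
}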
15087 case Intrinsic::x86_sse41_ptestz: 15088 case Intrinsic::x86_sse41_ptestc: 15089 case Intrinsic::x86_sse41_ptestnzc: 15090 case Intrinsic::x86_avx_ptestz_256: 15091 case Intrinsic::x86_avx_ptestc_256: 15092 case Intrinsic::x86_avx_ptestnzc_256: 15093 case Intrinsic::x86_avx_vtestz_ps: 15094 case Intrinsic::x86_avx_vtestc_ps: 15095 case Intrinsic::x86_avx_vtestnzc_ps: 15096 case Intrinsic::x86_avx_vtestz_pd: 15097 case Intrinsic::x86_avx_vtestc_pd: 15098 case Intrinsic::x86_avx_vtestnzc_pd: 15099 case Intrinsic::x86_avx_vtestz_ps_256: 15100 case Intrinsic::x86_avx_vtestc_ps_256: 15101 case Intrinsic::x86_avx_vtestnzc_ps_256: 15102 case Intrinsic::x86_avx_vtestz_pd_256: 15103 case Intrinsic::x86_avx_vtestc_pd_256: 15104 case Intrinsic::x86_avx_vtestnzc_pd_256: { 15105 bool IsTestPacked = false; 15106 unsigned X86CC; 15107 switch (IntNo) { 15108 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); 15109 case Intrinsic::x86_avx_vtestz_ps: 15110 case Intrinsic::x86_avx_vtestz_pd: 15111 case Intrinsic::x86_avx_vtestz_ps_256: 15112 case Intrinsic::x86_avx_vtestz_pd_256: 15113 IsTestPacked = true; // Fallthrough 15114 case Intrinsic::x86_sse41_ptestz: 15115 case Intrinsic::x86_avx_ptestz_256: 15116 // ZF = 1 15117 X86CC = X86::COND_E; 15118 break; 15119 case Intrinsic::x86_avx_vtestc_ps: 15120 case Intrinsic::x86_avx_vtestc_pd: 15121 case Intrinsic::x86_avx_vtestc_ps_256: 15122 case Intrinsic::x86_avx_vtestc_pd_256: 15123 IsTestPacked = true; // Fallthrough 15124 case Intrinsic::x86_sse41_ptestc: 15125 case Intrinsic::x86_avx_ptestc_256: 15126 // CF = 1 15127 X86CC = X86::COND_B; 15128 break; 15129 case Intrinsic::x86_avx_vtestnzc_ps: 15130 case Intrinsic::x86_avx_vtestnzc_pd: 15131 case Intrinsic::x86_avx_vtestnzc_ps_256: 15132 case Intrinsic::x86_avx_vtestnzc_pd_256: 15133 IsTestPacked = true; // Fallthrough 15134 case Intrinsic::x86_sse41_ptestnzc: 15135 case Intrinsic::x86_avx_ptestnzc_256: 15136 // ZF and CF = 0 15137 X86CC = X86::COND_A; 15138 break; 15139 } 15140 15141 SDValue LHS = Op.getOperand(1); 15142 SDValue RHS = Op.getOperand(2); 15143 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST; 15144 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS); 15145 SDValue CC = DAG.getConstant(X86CC, MVT::i8); 15146 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); 15147 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 15148 } 15149 case Intrinsic::x86_avx512_kortestz_w: 15150 case Intrinsic::x86_avx512_kortestc_w: { 15151 unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz_w)? 
X86::COND_E: X86::COND_B; 15152 SDValue LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(1)); 15153 SDValue RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(2)); 15154 SDValue CC = DAG.getConstant(X86CC, MVT::i8); 15155 SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS); 15156 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i1, CC, Test); 15157 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 15158 } 15159 15160 case Intrinsic::x86_sse42_pcmpistria128: 15161 case Intrinsic::x86_sse42_pcmpestria128: 15162 case Intrinsic::x86_sse42_pcmpistric128: 15163 case Intrinsic::x86_sse42_pcmpestric128: 15164 case Intrinsic::x86_sse42_pcmpistrio128: 15165 case Intrinsic::x86_sse42_pcmpestrio128: 15166 case Intrinsic::x86_sse42_pcmpistris128: 15167 case Intrinsic::x86_sse42_pcmpestris128: 15168 case Intrinsic::x86_sse42_pcmpistriz128: 15169 case Intrinsic::x86_sse42_pcmpestriz128: { 15170 unsigned Opcode; 15171 unsigned X86CC; 15172 switch (IntNo) { 15173 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 15174 case Intrinsic::x86_sse42_pcmpistria128: 15175 Opcode = X86ISD::PCMPISTRI; 15176 X86CC = X86::COND_A; 15177 break; 15178 case Intrinsic::x86_sse42_pcmpestria128: 15179 Opcode = X86ISD::PCMPESTRI; 15180 X86CC = X86::COND_A; 15181 break; 15182 case Intrinsic::x86_sse42_pcmpistric128: 15183 Opcode = X86ISD::PCMPISTRI; 15184 X86CC = X86::COND_B; 15185 break; 15186 case Intrinsic::x86_sse42_pcmpestric128: 15187 Opcode = X86ISD::PCMPESTRI; 15188 X86CC = X86::COND_B; 15189 break; 15190 case Intrinsic::x86_sse42_pcmpistrio128: 15191 Opcode = X86ISD::PCMPISTRI; 15192 X86CC = X86::COND_O; 15193 break; 15194 case Intrinsic::x86_sse42_pcmpestrio128: 15195 Opcode = X86ISD::PCMPESTRI; 15196 X86CC = X86::COND_O; 15197 break; 15198 case Intrinsic::x86_sse42_pcmpistris128: 15199 Opcode = X86ISD::PCMPISTRI; 15200 X86CC = X86::COND_S; 15201 break; 15202 case Intrinsic::x86_sse42_pcmpestris128: 15203 Opcode = X86ISD::PCMPESTRI; 15204 X86CC = X86::COND_S; 15205 break; 15206 case Intrinsic::x86_sse42_pcmpistriz128: 15207 Opcode = X86ISD::PCMPISTRI; 15208 X86CC = X86::COND_E; 15209 break; 15210 case Intrinsic::x86_sse42_pcmpestriz128: 15211 Opcode = X86ISD::PCMPESTRI; 15212 X86CC = X86::COND_E; 15213 break; 15214 } 15215 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end()); 15216 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 15217 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps); 15218 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 15219 DAG.getConstant(X86CC, MVT::i8), 15220 SDValue(PCMP.getNode(), 1)); 15221 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 15222 } 15223 15224 case Intrinsic::x86_sse42_pcmpistri128: 15225 case Intrinsic::x86_sse42_pcmpestri128: { 15226 unsigned Opcode; 15227 if (IntNo == Intrinsic::x86_sse42_pcmpistri128) 15228 Opcode = X86ISD::PCMPISTRI; 15229 else 15230 Opcode = X86ISD::PCMPESTRI; 15231 15232 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end()); 15233 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 15234 return DAG.getNode(Opcode, dl, VTs, NewOps); 15235 } 15236 15237 case Intrinsic::x86_fma_mask_vfmadd_ps_512: 15238 case Intrinsic::x86_fma_mask_vfmadd_pd_512: 15239 case Intrinsic::x86_fma_mask_vfmsub_ps_512: 15240 case Intrinsic::x86_fma_mask_vfmsub_pd_512: 15241 case Intrinsic::x86_fma_mask_vfnmadd_ps_512: 15242 case Intrinsic::x86_fma_mask_vfnmadd_pd_512: 15243 case Intrinsic::x86_fma_mask_vfnmsub_ps_512: 15244 case 
Intrinsic::x86_fma_mask_vfnmsub_pd_512: 15245 case Intrinsic::x86_fma_mask_vfmaddsub_ps_512: 15246 case Intrinsic::x86_fma_mask_vfmaddsub_pd_512: 15247 case Intrinsic::x86_fma_mask_vfmsubadd_ps_512: 15248 case Intrinsic::x86_fma_mask_vfmsubadd_pd_512: { 15249 auto *SAE = cast<ConstantSDNode>(Op.getOperand(5)); 15250 if (SAE->getZExtValue() == X86::STATIC_ROUNDING::CUR_DIRECTION) 15251 return getVectorMaskingNode(DAG.getNode(getOpcodeForFMAIntrinsic(IntNo), 15252 dl, Op.getValueType(), 15253 Op.getOperand(1), 15254 Op.getOperand(2), 15255 Op.getOperand(3)), 15256 Op.getOperand(4), Op.getOperand(1), DAG); 15257 else 15258 return SDValue(); 15259 } 15260 15261 case Intrinsic::x86_fma_vfmadd_ps: 15262 case Intrinsic::x86_fma_vfmadd_pd: 15263 case Intrinsic::x86_fma_vfmsub_ps: 15264 case Intrinsic::x86_fma_vfmsub_pd: 15265 case Intrinsic::x86_fma_vfnmadd_ps: 15266 case Intrinsic::x86_fma_vfnmadd_pd: 15267 case Intrinsic::x86_fma_vfnmsub_ps: 15268 case Intrinsic::x86_fma_vfnmsub_pd: 15269 case Intrinsic::x86_fma_vfmaddsub_ps: 15270 case Intrinsic::x86_fma_vfmaddsub_pd: 15271 case Intrinsic::x86_fma_vfmsubadd_ps: 15272 case Intrinsic::x86_fma_vfmsubadd_pd: 15273 case Intrinsic::x86_fma_vfmadd_ps_256: 15274 case Intrinsic::x86_fma_vfmadd_pd_256: 15275 case Intrinsic::x86_fma_vfmsub_ps_256: 15276 case Intrinsic::x86_fma_vfmsub_pd_256: 15277 case Intrinsic::x86_fma_vfnmadd_ps_256: 15278 case Intrinsic::x86_fma_vfnmadd_pd_256: 15279 case Intrinsic::x86_fma_vfnmsub_ps_256: 15280 case Intrinsic::x86_fma_vfnmsub_pd_256: 15281 case Intrinsic::x86_fma_vfmaddsub_ps_256: 15282 case Intrinsic::x86_fma_vfmaddsub_pd_256: 15283 case Intrinsic::x86_fma_vfmsubadd_ps_256: 15284 case Intrinsic::x86_fma_vfmsubadd_pd_256: 15285 return DAG.getNode(getOpcodeForFMAIntrinsic(IntNo), dl, Op.getValueType(), 15286 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 15287 } 15288 } 15289 15290 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, 15291 SDValue Src, SDValue Mask, SDValue Base, 15292 SDValue Index, SDValue ScaleOp, SDValue Chain, 15293 const X86Subtarget * Subtarget) { 15294 SDLoc dl(Op); 15295 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); 15296 assert(C && "Invalid scale type"); 15297 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8); 15298 EVT MaskVT = MVT::getVectorVT(MVT::i1, 15299 Index.getSimpleValueType().getVectorNumElements()); 15300 SDValue MaskInReg; 15301 ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask); 15302 if (MaskC) 15303 MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT); 15304 else 15305 MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask); 15306 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); 15307 SDValue Disp = DAG.getTargetConstant(0, MVT::i32); 15308 SDValue Segment = DAG.getRegister(0, MVT::i32); 15309 if (Src.getOpcode() == ISD::UNDEF) 15310 Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl); 15311 SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; 15312 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); 15313 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; 15314 return DAG.getMergeValues(RetOps, dl); 15315 } 15316 15317 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, 15318 SDValue Src, SDValue Mask, SDValue Base, 15319 SDValue Index, SDValue ScaleOp, SDValue Chain) { 15320 SDLoc dl(Op); 15321 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); 15322 assert(C && "Invalid scale type"); 15323 SDValue Scale = 
DAG.getTargetConstant(C->getZExtValue(), MVT::i8); 15324 SDValue Disp = DAG.getTargetConstant(0, MVT::i32); 15325 SDValue Segment = DAG.getRegister(0, MVT::i32); 15326 EVT MaskVT = MVT::getVectorVT(MVT::i1, 15327 Index.getSimpleValueType().getVectorNumElements()); 15328 SDValue MaskInReg; 15329 ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask); 15330 if (MaskC) 15331 MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT); 15332 else 15333 MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask); 15334 SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); 15335 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain}; 15336 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); 15337 return SDValue(Res, 1); 15338 } 15339 15340 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, 15341 SDValue Mask, SDValue Base, SDValue Index, 15342 SDValue ScaleOp, SDValue Chain) { 15343 SDLoc dl(Op); 15344 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); 15345 assert(C && "Invalid scale type"); 15346 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8); 15347 SDValue Disp = DAG.getTargetConstant(0, MVT::i32); 15348 SDValue Segment = DAG.getRegister(0, MVT::i32); 15349 EVT MaskVT = 15350 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); 15351 SDValue MaskInReg; 15352 ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask); 15353 if (MaskC) 15354 MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT); 15355 else 15356 MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask); 15357 //SDVTList VTs = DAG.getVTList(MVT::Other); 15358 SDValue Ops[] = {MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; 15359 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops); 15360 return SDValue(Res, 0); 15361 } 15362 15363 // getReadPerformanceCounter - Handles the lowering of builtin intrinsics that 15364 // read performance monitor counters (x86_rdpmc). 15365 static void getReadPerformanceCounter(SDNode *N, SDLoc DL, 15366 SelectionDAG &DAG, const X86Subtarget *Subtarget, 15367 SmallVectorImpl<SDValue> &Results) { 15368 assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); 15369 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 15370 SDValue LO, HI; 15371 15372 // The ECX register is used to select the index of the performance counter 15373 // to read. 15374 SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, 15375 N->getOperand(2)); 15376 SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain); 15377 15378 // Reads the content of a 64-bit performance counter and returns it in the 15379 // registers EDX:EAX. 15380 if (Subtarget->is64Bit()) { 15381 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1)); 15382 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, 15383 LO.getValue(2)); 15384 } else { 15385 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1)); 15386 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32, 15387 LO.getValue(2)); 15388 } 15389 Chain = HI.getValue(1); 15390 15391 if (Subtarget->is64Bit()) { 15392 // The EAX register is loaded with the low-order 32 bits. The EDX register 15393 // is loaded with the supported high-order bits of the counter. 
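The recombination performed just below for 64-bit targets is simply (HI << 32) | LO. As a sketch, the same sequence written with GCC/Clang extended inline assembly (illustrative only; RDPMC additionally requires that user-mode access to the counters is enabled):

unsigned long long readPerformanceCounter(unsigned counterIndex) {
  unsigned lo, hi;
  __asm__ volatile("rdpmc" : "=a"(lo), "=d"(hi) : "c"(counterIndex));
  return ((unsigned long long)hi << 32) | lo;  // EDX:EAX merged into one 64-bit value
}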
15394 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, 15395 DAG.getConstant(32, MVT::i8)); 15396 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp)); 15397 Results.push_back(Chain); 15398 return; 15399 } 15400 15401 // Use a buildpair to merge the two 32-bit values into a 64-bit one. 15402 SDValue Ops[] = { LO, HI }; 15403 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops); 15404 Results.push_back(Pair); 15405 Results.push_back(Chain); 15406 } 15407 15408 // getReadTimeStampCounter - Handles the lowering of builtin intrinsics that 15409 // read the time stamp counter (x86_rdtsc and x86_rdtscp). This function is 15410 // also used to custom lower READCYCLECOUNTER nodes. 15411 static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode, 15412 SelectionDAG &DAG, const X86Subtarget *Subtarget, 15413 SmallVectorImpl<SDValue> &Results) { 15414 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 15415 SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0)); 15416 SDValue LO, HI; 15417 15418 // The processor's time-stamp counter (a 64-bit MSR) is stored into the 15419 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR 15420 // and the EAX register is loaded with the low-order 32 bits. 15421 if (Subtarget->is64Bit()) { 15422 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1)); 15423 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, 15424 LO.getValue(2)); 15425 } else { 15426 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1)); 15427 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32, 15428 LO.getValue(2)); 15429 } 15430 SDValue Chain = HI.getValue(1); 15431 15432 if (Opcode == X86ISD::RDTSCP_DAG) { 15433 assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); 15434 15435 // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into 15436 // the ECX register. Add 'ecx' explicitly to the chain. 15437 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, 15438 HI.getValue(2)); 15439 // Explicitly store the content of ECX at the location passed in input 15440 // to the 'rdtscp' intrinsic. 15441 Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2), 15442 MachinePointerInfo(), false, false, 0); 15443 } 15444 15445 if (Subtarget->is64Bit()) { 15446 // The EDX register is loaded with the high-order 32 bits of the MSR, and 15447 // the EAX register is loaded with the low-order 32 bits. 15448 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, 15449 DAG.getConstant(32, MVT::i8)); 15450 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp)); 15451 Results.push_back(Chain); 15452 return; 15453 } 15454 15455 // Use a buildpair to merge the two 32-bit values into a 64-bit one. 
15456 SDValue Ops[] = { LO, HI }; 15457 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops); 15458 Results.push_back(Pair); 15459 Results.push_back(Chain); 15460 } 15461 15462 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget, 15463 SelectionDAG &DAG) { 15464 SmallVector<SDValue, 2> Results; 15465 SDLoc DL(Op); 15466 getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget, 15467 Results); 15468 return DAG.getMergeValues(Results, DL); 15469 } 15470 15471 15472 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, 15473 SelectionDAG &DAG) { 15474 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 15475 15476 const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo); 15477 if (!IntrData) 15478 return SDValue(); 15479 15480 SDLoc dl(Op); 15481 switch(IntrData->Type) { 15482 default: 15483 llvm_unreachable("Unknown Intrinsic Type"); 15484 break; 15485 case RDSEED: 15486 case RDRAND: { 15487 // Emit the node with the right value type. 15488 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other); 15489 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0)); 15490 15491 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1. 15492 // Otherwise return the value from Rand, which is always 0, casted to i32. 15493 SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)), 15494 DAG.getConstant(1, Op->getValueType(1)), 15495 DAG.getConstant(X86::COND_B, MVT::i32), 15496 SDValue(Result.getNode(), 1) }; 15497 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, 15498 DAG.getVTList(Op->getValueType(1), MVT::Glue), 15499 Ops); 15500 15501 // Return { result, isValid, chain }. 15502 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid, 15503 SDValue(Result.getNode(), 2)); 15504 } 15505 case GATHER: { 15506 //gather(v1, mask, index, base, scale); 15507 SDValue Chain = Op.getOperand(0); 15508 SDValue Src = Op.getOperand(2); 15509 SDValue Base = Op.getOperand(3); 15510 SDValue Index = Op.getOperand(4); 15511 SDValue Mask = Op.getOperand(5); 15512 SDValue Scale = Op.getOperand(6); 15513 return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain, 15514 Subtarget); 15515 } 15516 case SCATTER: { 15517 //scatter(base, mask, index, v1, scale); 15518 SDValue Chain = Op.getOperand(0); 15519 SDValue Base = Op.getOperand(2); 15520 SDValue Mask = Op.getOperand(3); 15521 SDValue Index = Op.getOperand(4); 15522 SDValue Src = Op.getOperand(5); 15523 SDValue Scale = Op.getOperand(6); 15524 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain); 15525 } 15526 case PREFETCH: { 15527 SDValue Hint = Op.getOperand(6); 15528 unsigned HintVal; 15529 if (dyn_cast<ConstantSDNode> (Hint) == nullptr || 15530 (HintVal = dyn_cast<ConstantSDNode> (Hint)->getZExtValue()) > 1) 15531 llvm_unreachable("Wrong prefetch hint in intrinsic: should be 0 or 1"); 15532 unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0); 15533 SDValue Chain = Op.getOperand(0); 15534 SDValue Mask = Op.getOperand(2); 15535 SDValue Index = Op.getOperand(3); 15536 SDValue Base = Op.getOperand(4); 15537 SDValue Scale = Op.getOperand(5); 15538 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain); 15539 } 15540 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP). 
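Looking back at the RDRAND/RDSEED case above: the X86ISD::CMOV on X86::COND_B encodes the hardware contract that CF=1 means the destination holds a valid random value and CF=0 means the caller should retry. A usage sketch with the matching compiler intrinsic (assumes RDRAND-capable hardware and -mrdrnd):

#include <immintrin.h>

int tryHardwareRandom32(unsigned *out) {
  return _rdrand32_step(out);                  // returns 1 exactly when rdrand set CF
}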
15541 case RDTSC: { 15542 SmallVector<SDValue, 2> Results; 15543 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget, Results); 15544 return DAG.getMergeValues(Results, dl); 15545 } 15546 // Read Performance Monitoring Counters. 15547 case RDPMC: { 15548 SmallVector<SDValue, 2> Results; 15549 getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results); 15550 return DAG.getMergeValues(Results, dl); 15551 } 15552 // XTEST intrinsics. 15553 case XTEST: { 15554 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other); 15555 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0)); 15556 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 15557 DAG.getConstant(X86::COND_NE, MVT::i8), 15558 InTrans); 15559 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC); 15560 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), 15561 Ret, SDValue(InTrans.getNode(), 1)); 15562 } 15563 // ADC/ADCX/SBB 15564 case ADX: { 15565 SmallVector<SDValue, 2> Results; 15566 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other); 15567 SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other); 15568 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2), 15569 DAG.getConstant(-1, MVT::i8)); 15570 SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3), 15571 Op.getOperand(4), GenCF.getValue(1)); 15572 SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0), 15573 Op.getOperand(5), MachinePointerInfo(), 15574 false, false, 0); 15575 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 15576 DAG.getConstant(X86::COND_B, MVT::i8), 15577 Res.getValue(1)); 15578 Results.push_back(SetCC); 15579 Results.push_back(Store); 15580 return DAG.getMergeValues(Results, dl); 15581 } 15582 } 15583 } 15584 15585 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, 15586 SelectionDAG &DAG) const { 15587 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 15588 MFI->setReturnAddressIsTaken(true); 15589 15590 if (verifyReturnAddressArgumentIsConstant(Op, DAG)) 15591 return SDValue(); 15592 15593 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 15594 SDLoc dl(Op); 15595 EVT PtrVT = getPointerTy(); 15596 15597 if (Depth > 0) { 15598 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 15599 const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( 15600 DAG.getSubtarget().getRegisterInfo()); 15601 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT); 15602 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), 15603 DAG.getNode(ISD::ADD, dl, PtrVT, 15604 FrameAddr, Offset), 15605 MachinePointerInfo(), false, false, false, 0); 15606 } 15607 15608 // Just load the return address. 
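The Depth > 0 path above, together with LowerFRAMEADDR further below, amounts to walking the saved-frame-pointer chain and then reading the slot one pointer above it. A sketch in plain C++ (assuming frame-pointer-based frames, which the FrameReg asserts require; helper names are placeholders):

void **frameAddress(void **framePtr, unsigned depth) {
  while (depth--)
    framePtr = (void **)*framePtr;             // each frame stores the caller's frame pointer at offset 0
  return framePtr;
}

void *returnAddress(void **framePtr, unsigned depth) {
  return *(frameAddress(framePtr, depth) + 1); // the return address sits one slot (SlotSize) higher
}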
15609 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); 15610 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), 15611 RetAddrFI, MachinePointerInfo(), false, false, false, 0); 15612 } 15613 15614 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { 15615 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 15616 MFI->setFrameAddressIsTaken(true); 15617 15618 EVT VT = Op.getValueType(); 15619 SDLoc dl(Op); // FIXME probably not meaningful 15620 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 15621 const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( 15622 DAG.getSubtarget().getRegisterInfo()); 15623 unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); 15624 assert(((FrameReg == X86::RBP && VT == MVT::i64) || 15625 (FrameReg == X86::EBP && VT == MVT::i32)) && 15626 "Invalid Frame Register!"); 15627 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 15628 while (Depth--) 15629 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, 15630 MachinePointerInfo(), 15631 false, false, false, 0); 15632 return FrameAddr; 15633 } 15634 15635 // FIXME? Maybe this could be a TableGen attribute on some registers and 15636 // this table could be generated automatically from RegInfo. 15637 unsigned X86TargetLowering::getRegisterByName(const char* RegName, 15638 EVT VT) const { 15639 unsigned Reg = StringSwitch<unsigned>(RegName) 15640 .Case("esp", X86::ESP) 15641 .Case("rsp", X86::RSP) 15642 .Default(0); 15643 if (Reg) 15644 return Reg; 15645 report_fatal_error("Invalid register name global variable"); 15646 } 15647 15648 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, 15649 SelectionDAG &DAG) const { 15650 const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( 15651 DAG.getSubtarget().getRegisterInfo()); 15652 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize()); 15653 } 15654 15655 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { 15656 SDValue Chain = Op.getOperand(0); 15657 SDValue Offset = Op.getOperand(1); 15658 SDValue Handler = Op.getOperand(2); 15659 SDLoc dl (Op); 15660 15661 EVT PtrVT = getPointerTy(); 15662 const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( 15663 DAG.getSubtarget().getRegisterInfo()); 15664 unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); 15665 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) || 15666 (FrameReg == X86::EBP && PtrVT == MVT::i32)) && 15667 "Invalid Frame Register!"); 15668 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT); 15669 unsigned StoreAddrReg = (PtrVT == MVT::i64) ? 
X86::RCX : X86::ECX; 15670 15671 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame, 15672 DAG.getIntPtrConstant(RegInfo->getSlotSize())); 15673 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset); 15674 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(), 15675 false, false, 0); 15676 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); 15677 15678 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain, 15679 DAG.getRegister(StoreAddrReg, PtrVT)); 15680 } 15681 15682 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op, 15683 SelectionDAG &DAG) const { 15684 SDLoc DL(Op); 15685 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL, 15686 DAG.getVTList(MVT::i32, MVT::Other), 15687 Op.getOperand(0), Op.getOperand(1)); 15688 } 15689 15690 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op, 15691 SelectionDAG &DAG) const { 15692 SDLoc DL(Op); 15693 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other, 15694 Op.getOperand(0), Op.getOperand(1)); 15695 } 15696 15697 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) { 15698 return Op.getOperand(0); 15699 } 15700 15701 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, 15702 SelectionDAG &DAG) const { 15703 SDValue Root = Op.getOperand(0); 15704 SDValue Trmp = Op.getOperand(1); // trampoline 15705 SDValue FPtr = Op.getOperand(2); // nested function 15706 SDValue Nest = Op.getOperand(3); // 'nest' parameter value 15707 SDLoc dl (Op); 15708 15709 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 15710 const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo(); 15711 15712 if (Subtarget->is64Bit()) { 15713 SDValue OutChains[6]; 15714 15715 // Large code-model. 15716 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. 15717 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. 15718 15719 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7; 15720 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7; 15721 15722 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix 15723 15724 // Load the pointer to the nested function into R11. 15725 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 15726 SDValue Addr = Trmp; 15727 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 15728 Addr, MachinePointerInfo(TrmpAddr), 15729 false, false, 0); 15730 15731 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 15732 DAG.getConstant(2, MVT::i64)); 15733 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, 15734 MachinePointerInfo(TrmpAddr, 2), 15735 false, false, 2); 15736 15737 // Load the 'nest' parameter value into R10. 15738 // R10 is specified in X86CallingConv.td 15739 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 15740 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 15741 DAG.getConstant(10, MVT::i64)); 15742 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 15743 Addr, MachinePointerInfo(TrmpAddr, 10), 15744 false, false, 0); 15745 15746 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 15747 DAG.getConstant(12, MVT::i64)); 15748 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, 15749 MachinePointerInfo(TrmpAddr, 12), 15750 false, false, 2); 15751 15752 // Jump to the nested function. 15753 OpCode = (JMP64r << 8) | REX_WB; // jmpq *... 
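The six stores around this point assemble the following 23 bytes of machine code into the trampoline. The array below is a sketch of that image, with zero placeholders where the real FPtr and Nest immediates are written at offsets 2 and 12:

const unsigned char Trampoline64[] = {
  0x49, 0xBB, 0, 0, 0, 0, 0, 0, 0, 0,          // offset  0: movabs r11, <nested function>
  0x49, 0xBA, 0, 0, 0, 0, 0, 0, 0, 0,          // offset 10: movabs r10, <nest parameter>
  0x49, 0xFF, 0xE3                             // offset 20: jmp *r11 (REX prefix + FF /4 + ModRM 0xE3)
};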
15754 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 15755 DAG.getConstant(20, MVT::i64)); 15756 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 15757 Addr, MachinePointerInfo(TrmpAddr, 20), 15758 false, false, 0); 15759 15760 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 15761 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 15762 DAG.getConstant(22, MVT::i64)); 15763 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr, 15764 MachinePointerInfo(TrmpAddr, 22), 15765 false, false, 0); 15766 15767 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); 15768 } else { 15769 const Function *Func = 15770 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); 15771 CallingConv::ID CC = Func->getCallingConv(); 15772 unsigned NestReg; 15773 15774 switch (CC) { 15775 default: 15776 llvm_unreachable("Unsupported calling convention"); 15777 case CallingConv::C: 15778 case CallingConv::X86_StdCall: { 15779 // Pass 'nest' parameter in ECX. 15780 // Must be kept in sync with X86CallingConv.td 15781 NestReg = X86::ECX; 15782 15783 // Check that ECX wasn't needed by an 'inreg' parameter. 15784 FunctionType *FTy = Func->getFunctionType(); 15785 const AttributeSet &Attrs = Func->getAttributes(); 15786 15787 if (!Attrs.isEmpty() && !Func->isVarArg()) { 15788 unsigned InRegCount = 0; 15789 unsigned Idx = 1; 15790 15791 for (FunctionType::param_iterator I = FTy->param_begin(), 15792 E = FTy->param_end(); I != E; ++I, ++Idx) 15793 if (Attrs.hasAttribute(Idx, Attribute::InReg)) 15794 // FIXME: should only count parameters that are lowered to integers. 15795 InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; 15796 15797 if (InRegCount > 2) { 15798 report_fatal_error("Nest register in use - reduce number of inreg" 15799 " parameters!"); 15800 } 15801 } 15802 break; 15803 } 15804 case CallingConv::X86_FastCall: 15805 case CallingConv::X86_ThisCall: 15806 case CallingConv::Fast: 15807 // Pass 'nest' parameter in EAX. 15808 // Must be kept in sync with X86CallingConv.td 15809 NestReg = X86::EAX; 15810 break; 15811 } 15812 15813 SDValue OutChains[4]; 15814 SDValue Addr, Disp; 15815 15816 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 15817 DAG.getConstant(10, MVT::i32)); 15818 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); 15819 15820 // This is storing the opcode for MOV32ri. 15821 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. 15822 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7; 15823 OutChains[0] = DAG.getStore(Root, dl, 15824 DAG.getConstant(MOV32ri|N86Reg, MVT::i8), 15825 Trmp, MachinePointerInfo(TrmpAddr), 15826 false, false, 0); 15827 15828 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 15829 DAG.getConstant(1, MVT::i32)); 15830 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, 15831 MachinePointerInfo(TrmpAddr, 1), 15832 false, false, 1); 15833 15834 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. 
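    // For illustration, the resulting 10-byte 32-bit trampoline is roughly:
    //   +0  B8+reg <Nest, 4 bytes>   movl $Nest, %ecx or %eax   (B9 / B8)
    //   +5  E9 <rel32>               jmp  FPtr   (rel32 = FPtr - (Trmp + 10))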
15835 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 15836 DAG.getConstant(5, MVT::i32)); 15837 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr, 15838 MachinePointerInfo(TrmpAddr, 5), 15839 false, false, 1); 15840 15841 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 15842 DAG.getConstant(6, MVT::i32)); 15843 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, 15844 MachinePointerInfo(TrmpAddr, 6), 15845 false, false, 1); 15846 15847 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); 15848 } 15849 } 15850 15851 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, 15852 SelectionDAG &DAG) const { 15853 /* 15854 The rounding mode is in bits 11:10 of FPSR, and has the following 15855 settings: 15856 00 Round to nearest 15857 01 Round to -inf 15858 10 Round to +inf 15859 11 Round to 0 15860 15861 FLT_ROUNDS, on the other hand, expects the following: 15862 -1 Undefined 15863 0 Round to 0 15864 1 Round to nearest 15865 2 Round to +inf 15866 3 Round to -inf 15867 15868 To perform the conversion, we do: 15869 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3) 15870 */ 15871 15872 MachineFunction &MF = DAG.getMachineFunction(); 15873 const TargetMachine &TM = MF.getTarget(); 15874 const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering(); 15875 unsigned StackAlignment = TFI.getStackAlignment(); 15876 MVT VT = Op.getSimpleValueType(); 15877 SDLoc DL(Op); 15878 15879 // Save FP Control Word to stack slot 15880 int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false); 15881 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 15882 15883 MachineMemOperand *MMO = 15884 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 15885 MachineMemOperand::MOStore, 2, 2); 15886 15887 SDValue Ops[] = { DAG.getEntryNode(), StackSlot }; 15888 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, 15889 DAG.getVTList(MVT::Other), 15890 Ops, MVT::i16, MMO); 15891 15892 // Load FP Control Word from stack slot 15893 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, 15894 MachinePointerInfo(), false, false, false, 0); 15895 15896 // Transform as necessary 15897 SDValue CWD1 = 15898 DAG.getNode(ISD::SRL, DL, MVT::i16, 15899 DAG.getNode(ISD::AND, DL, MVT::i16, 15900 CWD, DAG.getConstant(0x800, MVT::i16)), 15901 DAG.getConstant(11, MVT::i8)); 15902 SDValue CWD2 = 15903 DAG.getNode(ISD::SRL, DL, MVT::i16, 15904 DAG.getNode(ISD::AND, DL, MVT::i16, 15905 CWD, DAG.getConstant(0x400, MVT::i16)), 15906 DAG.getConstant(9, MVT::i8)); 15907 15908 SDValue RetVal = 15909 DAG.getNode(ISD::AND, DL, MVT::i16, 15910 DAG.getNode(ISD::ADD, DL, MVT::i16, 15911 DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2), 15912 DAG.getConstant(1, MVT::i16)), 15913 DAG.getConstant(3, MVT::i16)); 15914 15915 return DAG.getNode((VT.getSizeInBits() < 16 ? 15916 ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal); 15917 } 15918 15919 static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) { 15920 MVT VT = Op.getSimpleValueType(); 15921 EVT OpVT = VT; 15922 unsigned NumBits = VT.getSizeInBits(); 15923 SDLoc dl(Op); 15924 15925 Op = Op.getOperand(0); 15926 if (VT == MVT::i8) { 15927 // Zero extend to i32 since there is not an i8 bsr. 15928 OpVT = MVT::i32; 15929 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 15930 } 15931 15932 // Issue a bsr (scan bits in reverse) which also sets EFLAGS. 15933 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 15934 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); 15935 15936 // If src is zero (i.e. 
bsr sets ZF), returns NumBits. 15937 SDValue Ops[] = { 15938 Op, 15939 DAG.getConstant(NumBits+NumBits-1, OpVT), 15940 DAG.getConstant(X86::COND_E, MVT::i8), 15941 Op.getValue(1) 15942 }; 15943 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops); 15944 15945 // Finally xor with NumBits-1. 15946 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); 15947 15948 if (VT == MVT::i8) 15949 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 15950 return Op; 15951 } 15952 15953 static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) { 15954 MVT VT = Op.getSimpleValueType(); 15955 EVT OpVT = VT; 15956 unsigned NumBits = VT.getSizeInBits(); 15957 SDLoc dl(Op); 15958 15959 Op = Op.getOperand(0); 15960 if (VT == MVT::i8) { 15961 // Zero extend to i32 since there is not an i8 bsr. 15962 OpVT = MVT::i32; 15963 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 15964 } 15965 15966 // Issue a bsr (scan bits in reverse). 15967 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 15968 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); 15969 15970 // And xor with NumBits-1. 15971 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); 15972 15973 if (VT == MVT::i8) 15974 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 15975 return Op; 15976 } 15977 15978 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) { 15979 MVT VT = Op.getSimpleValueType(); 15980 unsigned NumBits = VT.getSizeInBits(); 15981 SDLoc dl(Op); 15982 Op = Op.getOperand(0); 15983 15984 // Issue a bsf (scan bits forward) which also sets EFLAGS. 15985 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 15986 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); 15987 15988 // If src is zero (i.e. bsf sets ZF), returns NumBits. 15989 SDValue Ops[] = { 15990 Op, 15991 DAG.getConstant(NumBits, VT), 15992 DAG.getConstant(X86::COND_E, MVT::i8), 15993 Op.getValue(1) 15994 }; 15995 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops); 15996 } 15997 15998 // Lower256IntArith - Break a 256-bit integer operation into two new 128-bit 15999 // ones, and then concatenate the result back. 
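// For example:
//   (v8i32 (add A, B))
//     -> (concat_vectors (add (extract_subvector A, 0),
//                             (extract_subvector B, 0)),
//                        (add (extract_subvector A, 4),
//                             (extract_subvector B, 4)))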
16000 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) { 16001 MVT VT = Op.getSimpleValueType(); 16002 16003 assert(VT.is256BitVector() && VT.isInteger() && 16004 "Unsupported value type for operation"); 16005 16006 unsigned NumElems = VT.getVectorNumElements(); 16007 SDLoc dl(Op); 16008 16009 // Extract the LHS vectors 16010 SDValue LHS = Op.getOperand(0); 16011 SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl); 16012 SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl); 16013 16014 // Extract the RHS vectors 16015 SDValue RHS = Op.getOperand(1); 16016 SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl); 16017 SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl); 16018 16019 MVT EltVT = VT.getVectorElementType(); 16020 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); 16021 16022 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, 16023 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1), 16024 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2)); 16025 } 16026 16027 static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) { 16028 assert(Op.getSimpleValueType().is256BitVector() && 16029 Op.getSimpleValueType().isInteger() && 16030 "Only handle AVX 256-bit vector integer operation"); 16031 return Lower256IntArith(Op, DAG); 16032 } 16033 16034 static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) { 16035 assert(Op.getSimpleValueType().is256BitVector() && 16036 Op.getSimpleValueType().isInteger() && 16037 "Only handle AVX 256-bit vector integer operation"); 16038 return Lower256IntArith(Op, DAG); 16039 } 16040 16041 static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, 16042 SelectionDAG &DAG) { 16043 SDLoc dl(Op); 16044 MVT VT = Op.getSimpleValueType(); 16045 16046 // Decompose 256-bit ops into smaller 128-bit ops. 16047 if (VT.is256BitVector() && !Subtarget->hasInt256()) 16048 return Lower256IntArith(Op, DAG); 16049 16050 SDValue A = Op.getOperand(0); 16051 SDValue B = Op.getOperand(1); 16052 16053 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle. 16054 if (VT == MVT::v4i32) { 16055 assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() && 16056 "Should not custom lower when pmuldq is available!"); 16057 16058 // Extract the odd parts. 16059 static const int UnpackMask[] = { 1, -1, 3, -1 }; 16060 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask); 16061 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask); 16062 16063 // Multiply the even parts. 16064 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B); 16065 // Now multiply odd parts. 16066 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds); 16067 16068 Evens = DAG.getNode(ISD::BITCAST, dl, VT, Evens); 16069 Odds = DAG.getNode(ISD::BITCAST, dl, VT, Odds); 16070 16071 // Merge the two vectors back together with a shuffle. This expands into 2 16072 // shuffles. 
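    // Illustration, with A = <a0,a1,a2,a3> and B = <b0,b1,b2,b3>:
    //   Evens = pmuludq(A, B)                      = <a0*b0, a2*b2>  (v2i64)
    //   Odds  = pmuludq(<a1,_,a3,_>, <b1,_,b3,_>)  = <a1*b1, a3*b3>  (v2i64)
    // Viewed as v4i32, lanes 0 and 2 of Evens/Odds hold the low halves of the
    // products, so the <0,4,2,6> shuffle below yields
    //   <a0*b0, a1*b1, a2*b2, a3*b3>  (each truncated to 32 bits).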
16073 static const int ShufMask[] = { 0, 4, 2, 6 }; 16074 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask); 16075 } 16076 16077 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) && 16078 "Only know how to lower V2I64/V4I64/V8I64 multiply"); 16079 16080 // Ahi = psrlqi(a, 32); 16081 // Bhi = psrlqi(b, 32); 16082 // 16083 // AloBlo = pmuludq(a, b); 16084 // AloBhi = pmuludq(a, Bhi); 16085 // AhiBlo = pmuludq(Ahi, b); 16086 16087 // AloBhi = psllqi(AloBhi, 32); 16088 // AhiBlo = psllqi(AhiBlo, 32); 16089 // return AloBlo + AloBhi + AhiBlo; 16090 16091 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG); 16092 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG); 16093 16094 // Bit cast to 32-bit vectors for MULUDQ 16095 EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 : 16096 (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32; 16097 A = DAG.getNode(ISD::BITCAST, dl, MulVT, A); 16098 B = DAG.getNode(ISD::BITCAST, dl, MulVT, B); 16099 Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi); 16100 Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi); 16101 16102 SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B); 16103 SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi); 16104 SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B); 16105 16106 AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG); 16107 AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG); 16108 16109 SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); 16110 return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); 16111 } 16112 16113 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const { 16114 assert(Subtarget->isTargetWin64() && "Unexpected target"); 16115 EVT VT = Op.getValueType(); 16116 assert(VT.isInteger() && VT.getSizeInBits() == 128 && 16117 "Unexpected return type for lowering"); 16118 16119 RTLIB::Libcall LC; 16120 bool isSigned; 16121 switch (Op->getOpcode()) { 16122 default: llvm_unreachable("Unexpected request for libcall!"); 16123 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break; 16124 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break; 16125 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break; 16126 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break; 16127 case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break; 16128 case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break; 16129 } 16130 16131 SDLoc dl(Op); 16132 SDValue InChain = DAG.getEntryNode(); 16133 16134 TargetLowering::ArgListTy Args; 16135 TargetLowering::ArgListEntry Entry; 16136 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) { 16137 EVT ArgVT = Op->getOperand(i).getValueType(); 16138 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 && 16139 "Unexpected argument type for lowering"); 16140 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16); 16141 Entry.Node = StackPtr; 16142 InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MachinePointerInfo(), 16143 false, false, 16); 16144 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 16145 Entry.Ty = PointerType::get(ArgTy,0); 16146 Entry.isSExt = false; 16147 Entry.isZExt = false; 16148 Args.push_back(Entry); 16149 } 16150 16151 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), 16152 getPointerTy()); 16153 16154 TargetLowering::CallLoweringInfo CLI(DAG); 16155 CLI.setDebugLoc(dl).setChain(InChain) 16156 
.setCallee(getLibcallCallingConv(LC), 16157 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), 16158 Callee, std::move(Args), 0) 16159 .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned); 16160 16161 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI); 16162 return DAG.getNode(ISD::BITCAST, dl, VT, CallInfo.first); 16163 } 16164 16165 static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget, 16166 SelectionDAG &DAG) { 16167 SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1); 16168 EVT VT = Op0.getValueType(); 16169 SDLoc dl(Op); 16170 16171 assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) || 16172 (VT == MVT::v8i32 && Subtarget->hasInt256())); 16173 16174 // PMULxD operations multiply each even value (starting at 0) of LHS with 16175 // the related value of RHS and produce a widen result. 16176 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h> 16177 // => <2 x i64> <ae|cg> 16178 // 16179 // In other word, to have all the results, we need to perform two PMULxD: 16180 // 1. one with the even values. 16181 // 2. one with the odd values. 16182 // To achieve #2, with need to place the odd values at an even position. 16183 // 16184 // Place the odd value at an even position (basically, shift all values 1 16185 // step to the left): 16186 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1}; 16187 // <a|b|c|d> => <b|undef|d|undef> 16188 SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask); 16189 // <e|f|g|h> => <f|undef|h|undef> 16190 SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask); 16191 16192 // Emit two multiplies, one for the lower 2 ints and one for the higher 2 16193 // ints. 16194 MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64; 16195 bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI; 16196 unsigned Opcode = 16197 (!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ; 16198 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h> 16199 // => <2 x i64> <ae|cg> 16200 SDValue Mul1 = DAG.getNode(ISD::BITCAST, dl, VT, 16201 DAG.getNode(Opcode, dl, MulVT, Op0, Op1)); 16202 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef> 16203 // => <2 x i64> <bf|dh> 16204 SDValue Mul2 = DAG.getNode(ISD::BITCAST, dl, VT, 16205 DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1)); 16206 16207 // Shuffle it back into the right order. 16208 SDValue Highs, Lows; 16209 if (VT == MVT::v8i32) { 16210 const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15}; 16211 Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask); 16212 const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14}; 16213 Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask); 16214 } else { 16215 const int HighMask[] = {1, 5, 3, 7}; 16216 Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask); 16217 const int LowMask[] = {0, 4, 2, 6}; 16218 Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask); 16219 } 16220 16221 // If we have a signed multiply but no PMULDQ fix up the high parts of a 16222 // unsigned multiply. 
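  // The fixup below relies on the identity (per 32-bit element, mod 2^32):
  //   mulhs(a, b) = mulhu(a, b) - (a < 0 ? b : 0) - (b < 0 ? a : 0)
  // A rough scalar sketch of the same computation (illustrative only, not
  // used anywhere in this file):
  //   uint32_t mulhs(int32_t a, int32_t b) {
  //     uint32_t hu = (uint32_t)(((uint64_t)(uint32_t)a * (uint32_t)b) >> 32);
  //     return hu - (a < 0 ? (uint32_t)b : 0) - (b < 0 ? (uint32_t)a : 0);
  //   }
  // T1 and T2 compute the two conditional terms with an arithmetic shift
  // (producing an all-ones or all-zeros mask) and an AND.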
16223 if (IsSigned && !Subtarget->hasSSE41()) { 16224 SDValue ShAmt = 16225 DAG.getConstant(31, DAG.getTargetLoweringInfo().getShiftAmountTy(VT)); 16226 SDValue T1 = DAG.getNode(ISD::AND, dl, VT, 16227 DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1); 16228 SDValue T2 = DAG.getNode(ISD::AND, dl, VT, 16229 DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0); 16230 16231 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2); 16232 Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup); 16233 } 16234 16235 // The first result of MUL_LOHI is actually the low value, followed by the 16236 // high value. 16237 SDValue Ops[] = {Lows, Highs}; 16238 return DAG.getMergeValues(Ops, dl); 16239 } 16240 16241 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, 16242 const X86Subtarget *Subtarget) { 16243 MVT VT = Op.getSimpleValueType(); 16244 SDLoc dl(Op); 16245 SDValue R = Op.getOperand(0); 16246 SDValue Amt = Op.getOperand(1); 16247 16248 // Optimize shl/srl/sra with constant shift amount. 16249 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) { 16250 if (auto *ShiftConst = BVAmt->getConstantSplatNode()) { 16251 uint64_t ShiftAmt = ShiftConst->getZExtValue(); 16252 16253 if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 || 16254 (Subtarget->hasInt256() && 16255 (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16)) || 16256 (Subtarget->hasAVX512() && 16257 (VT == MVT::v8i64 || VT == MVT::v16i32))) { 16258 if (Op.getOpcode() == ISD::SHL) 16259 return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt, 16260 DAG); 16261 if (Op.getOpcode() == ISD::SRL) 16262 return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, 16263 DAG); 16264 if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64) 16265 return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt, 16266 DAG); 16267 } 16268 16269 if (VT == MVT::v16i8) { 16270 if (Op.getOpcode() == ISD::SHL) { 16271 // Make a large shift. 16272 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, 16273 MVT::v8i16, R, ShiftAmt, 16274 DAG); 16275 SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL); 16276 // Zero out the rightmost bits. 16277 SmallVector<SDValue, 16> V(16, 16278 DAG.getConstant(uint8_t(-1U << ShiftAmt), 16279 MVT::i8)); 16280 return DAG.getNode(ISD::AND, dl, VT, SHL, 16281 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V)); 16282 } 16283 if (Op.getOpcode() == ISD::SRL) { 16284 // Make a large shift. 16285 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, 16286 MVT::v8i16, R, ShiftAmt, 16287 DAG); 16288 SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL); 16289 // Zero out the leftmost bits. 
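        // There is no byte-granular psrl, so the v16i8 shift is emulated with
        // the v8i16 shift above and the bits that crossed a byte boundary are
        // masked off here. E.g. for a right shift by 3 each mask byte is
        // 0xFF >> 3 = 0x1F (the SHL path above uses 0xFF << 3 = 0xF8 instead).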
16290 SmallVector<SDValue, 16> V(16, 16291 DAG.getConstant(uint8_t(-1U) >> ShiftAmt, 16292 MVT::i8)); 16293 return DAG.getNode(ISD::AND, dl, VT, SRL, 16294 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V)); 16295 } 16296 if (Op.getOpcode() == ISD::SRA) { 16297 if (ShiftAmt == 7) { 16298 // R s>> 7 === R s< 0 16299 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); 16300 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); 16301 } 16302 16303 // R s>> a === ((R u>> a) ^ m) - m 16304 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); 16305 SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt, 16306 MVT::i8)); 16307 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V); 16308 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); 16309 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); 16310 return Res; 16311 } 16312 llvm_unreachable("Unknown shift opcode."); 16313 } 16314 16315 if (Subtarget->hasInt256() && VT == MVT::v32i8) { 16316 if (Op.getOpcode() == ISD::SHL) { 16317 // Make a large shift. 16318 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, 16319 MVT::v16i16, R, ShiftAmt, 16320 DAG); 16321 SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL); 16322 // Zero out the rightmost bits. 16323 SmallVector<SDValue, 32> V(32, 16324 DAG.getConstant(uint8_t(-1U << ShiftAmt), 16325 MVT::i8)); 16326 return DAG.getNode(ISD::AND, dl, VT, SHL, 16327 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V)); 16328 } 16329 if (Op.getOpcode() == ISD::SRL) { 16330 // Make a large shift. 16331 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, 16332 MVT::v16i16, R, ShiftAmt, 16333 DAG); 16334 SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL); 16335 // Zero out the leftmost bits. 16336 SmallVector<SDValue, 32> V(32, 16337 DAG.getConstant(uint8_t(-1U) >> ShiftAmt, 16338 MVT::i8)); 16339 return DAG.getNode(ISD::AND, dl, VT, SRL, 16340 DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V)); 16341 } 16342 if (Op.getOpcode() == ISD::SRA) { 16343 if (ShiftAmt == 7) { 16344 // R s>> 7 === R s< 0 16345 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); 16346 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); 16347 } 16348 16349 // R s>> a === ((R u>> a) ^ m) - m 16350 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); 16351 SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt, 16352 MVT::i8)); 16353 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V); 16354 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); 16355 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); 16356 return Res; 16357 } 16358 llvm_unreachable("Unknown shift opcode."); 16359 } 16360 } 16361 } 16362 16363 // Special case in 32-bit mode, where i64 is expanded into high and low parts. 16364 if (!Subtarget->is64Bit() && 16365 (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) && 16366 Amt.getOpcode() == ISD::BITCAST && 16367 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { 16368 Amt = Amt.getOperand(0); 16369 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() / 16370 VT.getVectorNumElements(); 16371 unsigned RatioInLog2 = Log2_32_Ceil(Ratio); 16372 uint64_t ShiftAmt = 0; 16373 for (unsigned i = 0; i != Ratio; ++i) { 16374 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i)); 16375 if (!C) 16376 return SDValue(); 16377 // 6 == Log2(64) 16378 ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2))); 16379 } 16380 // Check remaining shift amounts. 
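    // E.g. for v2i64 in 32-bit mode the shift amount arrives as
    //   (bitcast (v4i32 build_vector <lo0, hi0, lo1, hi1>))
    // so Ratio == 2 and ShiftAmt == lo0 | (hi0 << 32); the loop below verifies
    // that every remaining group of Ratio elements encodes that same value.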
16381 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) { 16382 uint64_t ShAmt = 0; 16383 for (unsigned j = 0; j != Ratio; ++j) { 16384 ConstantSDNode *C = 16385 dyn_cast<ConstantSDNode>(Amt.getOperand(i + j)); 16386 if (!C) 16387 return SDValue(); 16388 // 6 == Log2(64) 16389 ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2))); 16390 } 16391 if (ShAmt != ShiftAmt) 16392 return SDValue(); 16393 } 16394 switch (Op.getOpcode()) { 16395 default: 16396 llvm_unreachable("Unknown shift opcode!"); 16397 case ISD::SHL: 16398 return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt, 16399 DAG); 16400 case ISD::SRL: 16401 return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, 16402 DAG); 16403 case ISD::SRA: 16404 return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt, 16405 DAG); 16406 } 16407 } 16408 16409 return SDValue(); 16410 } 16411 16412 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, 16413 const X86Subtarget* Subtarget) { 16414 MVT VT = Op.getSimpleValueType(); 16415 SDLoc dl(Op); 16416 SDValue R = Op.getOperand(0); 16417 SDValue Amt = Op.getOperand(1); 16418 16419 if ((VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) || 16420 VT == MVT::v4i32 || VT == MVT::v8i16 || 16421 (Subtarget->hasInt256() && 16422 ((VT == MVT::v4i64 && Op.getOpcode() != ISD::SRA) || 16423 VT == MVT::v8i32 || VT == MVT::v16i16)) || 16424 (Subtarget->hasAVX512() && (VT == MVT::v8i64 || VT == MVT::v16i32))) { 16425 SDValue BaseShAmt; 16426 EVT EltVT = VT.getVectorElementType(); 16427 16428 if (Amt.getOpcode() == ISD::BUILD_VECTOR) { 16429 unsigned NumElts = VT.getVectorNumElements(); 16430 unsigned i, j; 16431 for (i = 0; i != NumElts; ++i) { 16432 if (Amt.getOperand(i).getOpcode() == ISD::UNDEF) 16433 continue; 16434 break; 16435 } 16436 for (j = i; j != NumElts; ++j) { 16437 SDValue Arg = Amt.getOperand(j); 16438 if (Arg.getOpcode() == ISD::UNDEF) continue; 16439 if (Arg != Amt.getOperand(i)) 16440 break; 16441 } 16442 if (i != NumElts && j == NumElts) 16443 BaseShAmt = Amt.getOperand(i); 16444 } else { 16445 if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) 16446 Amt = Amt.getOperand(0); 16447 if (Amt.getOpcode() == ISD::VECTOR_SHUFFLE && 16448 cast<ShuffleVectorSDNode>(Amt)->isSplat()) { 16449 SDValue InVec = Amt.getOperand(0); 16450 if (InVec.getOpcode() == ISD::BUILD_VECTOR) { 16451 unsigned NumElts = InVec.getValueType().getVectorNumElements(); 16452 unsigned i = 0; 16453 for (; i != NumElts; ++i) { 16454 SDValue Arg = InVec.getOperand(i); 16455 if (Arg.getOpcode() == ISD::UNDEF) continue; 16456 BaseShAmt = Arg; 16457 break; 16458 } 16459 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { 16460 if (ConstantSDNode *C = 16461 dyn_cast<ConstantSDNode>(InVec.getOperand(2))) { 16462 unsigned SplatIdx = 16463 cast<ShuffleVectorSDNode>(Amt)->getSplatIndex(); 16464 if (C->getZExtValue() == SplatIdx) 16465 BaseShAmt = InVec.getOperand(1); 16466 } 16467 } 16468 if (!BaseShAmt.getNode()) 16469 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Amt, 16470 DAG.getIntPtrConstant(0)); 16471 } 16472 } 16473 16474 if (BaseShAmt.getNode()) { 16475 if (EltVT.bitsGT(MVT::i32)) 16476 BaseShAmt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BaseShAmt); 16477 else if (EltVT.bitsLT(MVT::i32)) 16478 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt); 16479 16480 switch (Op.getOpcode()) { 16481 default: 16482 llvm_unreachable("Unknown shift opcode!"); 16483 case ISD::SHL: 16484 switch (VT.SimpleTy) { 16485 default: return 
SDValue(); 16486 case MVT::v2i64: 16487 case MVT::v4i32: 16488 case MVT::v8i16: 16489 case MVT::v4i64: 16490 case MVT::v8i32: 16491 case MVT::v16i16: 16492 case MVT::v16i32: 16493 case MVT::v8i64: 16494 return getTargetVShiftNode(X86ISD::VSHLI, dl, VT, R, BaseShAmt, DAG); 16495 } 16496 case ISD::SRA: 16497 switch (VT.SimpleTy) { 16498 default: return SDValue(); 16499 case MVT::v4i32: 16500 case MVT::v8i16: 16501 case MVT::v8i32: 16502 case MVT::v16i16: 16503 case MVT::v16i32: 16504 case MVT::v8i64: 16505 return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, R, BaseShAmt, DAG); 16506 } 16507 case ISD::SRL: 16508 switch (VT.SimpleTy) { 16509 default: return SDValue(); 16510 case MVT::v2i64: 16511 case MVT::v4i32: 16512 case MVT::v8i16: 16513 case MVT::v4i64: 16514 case MVT::v8i32: 16515 case MVT::v16i16: 16516 case MVT::v16i32: 16517 case MVT::v8i64: 16518 return getTargetVShiftNode(X86ISD::VSRLI, dl, VT, R, BaseShAmt, DAG); 16519 } 16520 } 16521 } 16522 } 16523 16524 // Special case in 32-bit mode, where i64 is expanded into high and low parts. 16525 if (!Subtarget->is64Bit() && 16526 (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64) || 16527 (Subtarget->hasAVX512() && VT == MVT::v8i64)) && 16528 Amt.getOpcode() == ISD::BITCAST && 16529 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { 16530 Amt = Amt.getOperand(0); 16531 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() / 16532 VT.getVectorNumElements(); 16533 std::vector<SDValue> Vals(Ratio); 16534 for (unsigned i = 0; i != Ratio; ++i) 16535 Vals[i] = Amt.getOperand(i); 16536 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) { 16537 for (unsigned j = 0; j != Ratio; ++j) 16538 if (Vals[j] != Amt.getOperand(i + j)) 16539 return SDValue(); 16540 } 16541 switch (Op.getOpcode()) { 16542 default: 16543 llvm_unreachable("Unknown shift opcode!"); 16544 case ISD::SHL: 16545 return DAG.getNode(X86ISD::VSHL, dl, VT, R, Op.getOperand(1)); 16546 case ISD::SRL: 16547 return DAG.getNode(X86ISD::VSRL, dl, VT, R, Op.getOperand(1)); 16548 case ISD::SRA: 16549 return DAG.getNode(X86ISD::VSRA, dl, VT, R, Op.getOperand(1)); 16550 } 16551 } 16552 16553 return SDValue(); 16554 } 16555 16556 static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, 16557 SelectionDAG &DAG) { 16558 MVT VT = Op.getSimpleValueType(); 16559 SDLoc dl(Op); 16560 SDValue R = Op.getOperand(0); 16561 SDValue Amt = Op.getOperand(1); 16562 SDValue V; 16563 16564 assert(VT.isVector() && "Custom lowering only for vector shifts!"); 16565 assert(Subtarget->hasSSE2() && "Only custom lower when we have SSE2!"); 16566 16567 V = LowerScalarImmediateShift(Op, DAG, Subtarget); 16568 if (V.getNode()) 16569 return V; 16570 16571 V = LowerScalarVariableShift(Op, DAG, Subtarget); 16572 if (V.getNode()) 16573 return V; 16574 16575 if (Subtarget->hasAVX512() && (VT == MVT::v16i32 || VT == MVT::v8i64)) 16576 return Op; 16577 // AVX2 has VPSLLV/VPSRAV/VPSRLV. 16578 if (Subtarget->hasInt256()) { 16579 if (Op.getOpcode() == ISD::SRL && 16580 (VT == MVT::v2i64 || VT == MVT::v4i32 || 16581 VT == MVT::v4i64 || VT == MVT::v8i32)) 16582 return Op; 16583 if (Op.getOpcode() == ISD::SHL && 16584 (VT == MVT::v2i64 || VT == MVT::v4i32 || 16585 VT == MVT::v4i64 || VT == MVT::v8i32)) 16586 return Op; 16587 if (Op.getOpcode() == ISD::SRA && (VT == MVT::v4i32 || VT == MVT::v8i32)) 16588 return Op; 16589 } 16590 16591 // If possible, lower this packed shift into a vector multiply instead of 16592 // expanding it into a sequence of scalar shifts. 
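  // E.g. (shl (v4i32 X), <1, 2, 3, 4>) becomes (mul X, <2, 4, 8, 16>), with
  // undef or out-of-range shift amounts turning into undef multiplier lanes.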
16593 // Do this only if the vector shift count is a constant build_vector. 16594 if (Op.getOpcode() == ISD::SHL && 16595 (VT == MVT::v8i16 || VT == MVT::v4i32 || 16596 (Subtarget->hasInt256() && VT == MVT::v16i16)) && 16597 ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) { 16598 SmallVector<SDValue, 8> Elts; 16599 EVT SVT = VT.getScalarType(); 16600 unsigned SVTBits = SVT.getSizeInBits(); 16601 const APInt &One = APInt(SVTBits, 1); 16602 unsigned NumElems = VT.getVectorNumElements(); 16603 16604 for (unsigned i=0; i !=NumElems; ++i) { 16605 SDValue Op = Amt->getOperand(i); 16606 if (Op->getOpcode() == ISD::UNDEF) { 16607 Elts.push_back(Op); 16608 continue; 16609 } 16610 16611 ConstantSDNode *ND = cast<ConstantSDNode>(Op); 16612 const APInt &C = APInt(SVTBits, ND->getAPIntValue().getZExtValue()); 16613 uint64_t ShAmt = C.getZExtValue(); 16614 if (ShAmt >= SVTBits) { 16615 Elts.push_back(DAG.getUNDEF(SVT)); 16616 continue; 16617 } 16618 Elts.push_back(DAG.getConstant(One.shl(ShAmt), SVT)); 16619 } 16620 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts); 16621 return DAG.getNode(ISD::MUL, dl, VT, R, BV); 16622 } 16623 16624 // Lower SHL with variable shift amount. 16625 if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) { 16626 Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, VT)); 16627 16628 Op = DAG.getNode(ISD::ADD, dl, VT, Op, DAG.getConstant(0x3f800000U, VT)); 16629 Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op); 16630 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); 16631 return DAG.getNode(ISD::MUL, dl, VT, Op, R); 16632 } 16633 16634 // If possible, lower this shift as a sequence of two shifts by 16635 // constant plus a MOVSS/MOVSD instead of scalarizing it. 16636 // Example: 16637 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>))) 16638 // 16639 // Could be rewritten as: 16640 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>))) 16641 // 16642 // The advantage is that the two shifts from the example would be 16643 // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing 16644 // the vector shift into four scalar shifts plus four pairs of vector 16645 // insert/extract. 16646 if ((VT == MVT::v8i16 || VT == MVT::v4i32) && 16647 ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) { 16648 unsigned TargetOpcode = X86ISD::MOVSS; 16649 bool CanBeSimplified; 16650 // The splat value for the first packed shift (the 'X' from the example). 16651 SDValue Amt1 = Amt->getOperand(0); 16652 // The splat value for the second packed shift (the 'Y' from the example). 16653 SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : 16654 Amt->getOperand(2); 16655 16656 // See if it is possible to replace this node with a sequence of 16657 // two shifts followed by a MOVSS/MOVSD 16658 if (VT == MVT::v4i32) { 16659 // Check if it is legal to use a MOVSS. 16660 CanBeSimplified = Amt2 == Amt->getOperand(2) && 16661 Amt2 == Amt->getOperand(3); 16662 if (!CanBeSimplified) { 16663 // Otherwise, check if we can still simplify this node using a MOVSD. 16664 CanBeSimplified = Amt1 == Amt->getOperand(1) && 16665 Amt->getOperand(2) == Amt->getOperand(3); 16666 TargetOpcode = X86ISD::MOVSD; 16667 Amt2 = Amt->getOperand(2); 16668 } 16669 } else { 16670 // Do similar checks for the case where the machine value type 16671 // is MVT::v8i16. 
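      // The amount patterns that qualify here are
      //   <X,X, Y,Y,Y,Y,Y,Y> -> MOVSS blend (low 32 bits, i.e. two i16 lanes)
      //   <X,X,X,X, Y,Y,Y,Y> -> MOVSD blend (low 64 bits, i.e. four i16 lanes)
      // mirroring the v4i32 <X,Y,Y,Y> and <X,X,Y,Y> cases handled above.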
16672 CanBeSimplified = Amt1 == Amt->getOperand(1); 16673 for (unsigned i=3; i != 8 && CanBeSimplified; ++i) 16674 CanBeSimplified = Amt2 == Amt->getOperand(i); 16675 16676 if (!CanBeSimplified) { 16677 TargetOpcode = X86ISD::MOVSD; 16678 CanBeSimplified = true; 16679 Amt2 = Amt->getOperand(4); 16680 for (unsigned i=0; i != 4 && CanBeSimplified; ++i) 16681 CanBeSimplified = Amt1 == Amt->getOperand(i); 16682 for (unsigned j=4; j != 8 && CanBeSimplified; ++j) 16683 CanBeSimplified = Amt2 == Amt->getOperand(j); 16684 } 16685 } 16686 16687 if (CanBeSimplified && isa<ConstantSDNode>(Amt1) && 16688 isa<ConstantSDNode>(Amt2)) { 16689 // Replace this node with two shifts followed by a MOVSS/MOVSD. 16690 EVT CastVT = MVT::v4i32; 16691 SDValue Splat1 = 16692 DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), VT); 16693 SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1); 16694 SDValue Splat2 = 16695 DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), VT); 16696 SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2); 16697 if (TargetOpcode == X86ISD::MOVSD) 16698 CastVT = MVT::v2i64; 16699 SDValue BitCast1 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift1); 16700 SDValue BitCast2 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift2); 16701 SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2, 16702 BitCast1, DAG); 16703 return DAG.getNode(ISD::BITCAST, dl, VT, Result); 16704 } 16705 } 16706 16707 if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) { 16708 assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq."); 16709 16710 // a = a << 5; 16711 Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(5, VT)); 16712 Op = DAG.getNode(ISD::BITCAST, dl, VT, Op); 16713 16714 // Turn 'a' into a mask suitable for VSELECT 16715 SDValue VSelM = DAG.getConstant(0x80, VT); 16716 SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); 16717 OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); 16718 16719 SDValue CM1 = DAG.getConstant(0x0f, VT); 16720 SDValue CM2 = DAG.getConstant(0x3f, VT); 16721 16722 // r = VSELECT(r, psllw(r & (char16)15, 4), a); 16723 SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1); 16724 M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 4, DAG); 16725 M = DAG.getNode(ISD::BITCAST, dl, VT, M); 16726 R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R); 16727 16728 // a += a 16729 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); 16730 OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); 16731 OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); 16732 16733 // r = VSELECT(r, psllw(r & (char16)63, 2), a); 16734 M = DAG.getNode(ISD::AND, dl, VT, R, CM2); 16735 M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 2, DAG); 16736 M = DAG.getNode(ISD::BITCAST, dl, VT, M); 16737 R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R); 16738 16739 // a += a 16740 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); 16741 OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); 16742 OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); 16743 16744 // return VSELECT(r, r+r, a); 16745 R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, 16746 DAG.getNode(ISD::ADD, dl, VT, R, R), R); 16747 return R; 16748 } 16749 16750 // It's worth extending once and using the v8i32 shifts for 16-bit types, but 16751 // the extra overheads to get from v16i8 to v8i32 make the existing SSE 16752 // solution better. 16753 if (Subtarget->hasInt256() && VT == MVT::v8i16) { 16754 MVT NewVT = VT == MVT::v8i16 ? 
MVT::v8i32 : MVT::v16i16; 16755 unsigned ExtOpc = 16756 Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 16757 R = DAG.getNode(ExtOpc, dl, NewVT, R); 16758 Amt = DAG.getNode(ISD::ANY_EXTEND, dl, NewVT, Amt); 16759 return DAG.getNode(ISD::TRUNCATE, dl, VT, 16760 DAG.getNode(Op.getOpcode(), dl, NewVT, R, Amt)); 16761 } 16762 16763 // Decompose 256-bit shifts into smaller 128-bit shifts. 16764 if (VT.is256BitVector()) { 16765 unsigned NumElems = VT.getVectorNumElements(); 16766 MVT EltVT = VT.getVectorElementType(); 16767 EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); 16768 16769 // Extract the two vectors 16770 SDValue V1 = Extract128BitVector(R, 0, DAG, dl); 16771 SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl); 16772 16773 // Recreate the shift amount vectors 16774 SDValue Amt1, Amt2; 16775 if (Amt.getOpcode() == ISD::BUILD_VECTOR) { 16776 // Constant shift amount 16777 SmallVector<SDValue, 4> Amt1Csts; 16778 SmallVector<SDValue, 4> Amt2Csts; 16779 for (unsigned i = 0; i != NumElems/2; ++i) 16780 Amt1Csts.push_back(Amt->getOperand(i)); 16781 for (unsigned i = NumElems/2; i != NumElems; ++i) 16782 Amt2Csts.push_back(Amt->getOperand(i)); 16783 16784 Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt1Csts); 16785 Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt2Csts); 16786 } else { 16787 // Variable shift amount 16788 Amt1 = Extract128BitVector(Amt, 0, DAG, dl); 16789 Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl); 16790 } 16791 16792 // Issue new vector shifts for the smaller types 16793 V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1); 16794 V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2); 16795 16796 // Concatenate the result back 16797 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2); 16798 } 16799 16800 return SDValue(); 16801 } 16802 16803 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { 16804 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus 16805 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering 16806 // looks for this combo and may remove the "setcc" instruction if the "setcc" 16807 // has only one use. 16808 SDNode *N = Op.getNode(); 16809 SDValue LHS = N->getOperand(0); 16810 SDValue RHS = N->getOperand(1); 16811 unsigned BaseOp = 0; 16812 unsigned Cond = 0; 16813 SDLoc DL(Op); 16814 switch (Op.getOpcode()) { 16815 default: llvm_unreachable("Unknown ovf instruction!"); 16816 case ISD::SADDO: 16817 // A subtract of one will be selected as a INC. Note that INC doesn't 16818 // set CF, so we can't do this for UADDO. 16819 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) 16820 if (C->isOne()) { 16821 BaseOp = X86ISD::INC; 16822 Cond = X86::COND_O; 16823 break; 16824 } 16825 BaseOp = X86ISD::ADD; 16826 Cond = X86::COND_O; 16827 break; 16828 case ISD::UADDO: 16829 BaseOp = X86ISD::ADD; 16830 Cond = X86::COND_B; 16831 break; 16832 case ISD::SSUBO: 16833 // A subtract of one will be selected as a DEC. Note that DEC doesn't 16834 // set CF, so we can't do this for USUBO. 
16835 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) 16836 if (C->isOne()) { 16837 BaseOp = X86ISD::DEC; 16838 Cond = X86::COND_O; 16839 break; 16840 } 16841 BaseOp = X86ISD::SUB; 16842 Cond = X86::COND_O; 16843 break; 16844 case ISD::USUBO: 16845 BaseOp = X86ISD::SUB; 16846 Cond = X86::COND_B; 16847 break; 16848 case ISD::SMULO: 16849 BaseOp = X86ISD::SMUL; 16850 Cond = X86::COND_O; 16851 break; 16852 case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs 16853 SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0), 16854 MVT::i32); 16855 SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS); 16856 16857 SDValue SetCC = 16858 DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 16859 DAG.getConstant(X86::COND_O, MVT::i32), 16860 SDValue(Sum.getNode(), 2)); 16861 16862 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); 16863 } 16864 } 16865 16866 // Also sets EFLAGS. 16867 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32); 16868 SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS); 16869 16870 SDValue SetCC = 16871 DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1), 16872 DAG.getConstant(Cond, MVT::i32), 16873 SDValue(Sum.getNode(), 1)); 16874 16875 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); 16876 } 16877 16878 // Sign extension of the low part of vector elements. This may be used either 16879 // when sign extend instructions are not available or if the vector element 16880 // sizes already match the sign-extended size. If the vector elements are in 16881 // their pre-extended size and sign extend instructions are available, that will 16882 // be handled by LowerSIGN_EXTEND. 16883 SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, 16884 SelectionDAG &DAG) const { 16885 SDLoc dl(Op); 16886 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); 16887 MVT VT = Op.getSimpleValueType(); 16888 16889 if (!Subtarget->hasSSE2() || !VT.isVector()) 16890 return SDValue(); 16891 16892 unsigned BitsDiff = VT.getScalarType().getSizeInBits() - 16893 ExtraVT.getScalarType().getSizeInBits(); 16894 16895 switch (VT.SimpleTy) { 16896 default: return SDValue(); 16897 case MVT::v8i32: 16898 case MVT::v16i16: 16899 if (!Subtarget->hasFp256()) 16900 return SDValue(); 16901 if (!Subtarget->hasInt256()) { 16902 // needs to be split 16903 unsigned NumElems = VT.getVectorNumElements(); 16904 16905 // Extract the LHS vectors 16906 SDValue LHS = Op.getOperand(0); 16907 SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl); 16908 SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl); 16909 16910 MVT EltVT = VT.getVectorElementType(); 16911 EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); 16912 16913 EVT ExtraEltVT = ExtraVT.getVectorElementType(); 16914 unsigned ExtraNumElems = ExtraVT.getVectorNumElements(); 16915 ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT, 16916 ExtraNumElems/2); 16917 SDValue Extra = DAG.getValueType(ExtraVT); 16918 16919 LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra); 16920 LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra); 16921 16922 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2); 16923 } 16924 // fall through 16925 case MVT::v4i32: 16926 case MVT::v8i16: { 16927 SDValue Op0 = Op.getOperand(0); 16928 16929 // This is a sign extension of some low part of vector elements without 16930 // changing the size of the vector elements themselves: 16931 // Shift-Left + Shift-Right-Algebraic. 
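    // E.g. sign-extending from the low i16 of each i32 element uses
    // BitsDiff == 16:  (sext_inreg x, i16) == (x << 16) >>s 16,
    // which replicates bit 15 into bits 16..31 of every element.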
16932 SDValue Shl = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Op0, 16933 BitsDiff, DAG); 16934 return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Shl, BitsDiff, 16935 DAG); 16936 } 16937 } 16938 } 16939 16940 /// Returns true if the operand type is exactly twice the native width, and 16941 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available. 16942 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations 16943 /// (otherwise we leave them alone to become __sync_fetch_and_... calls). 16944 bool X86TargetLowering::needsCmpXchgNb(const Type *MemType) const { 16945 const X86Subtarget &Subtarget = 16946 getTargetMachine().getSubtarget<X86Subtarget>(); 16947 unsigned OpWidth = MemType->getPrimitiveSizeInBits(); 16948 16949 if (OpWidth == 64) 16950 return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b 16951 else if (OpWidth == 128) 16952 return Subtarget.hasCmpxchg16b(); 16953 else 16954 return false; 16955 } 16956 16957 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { 16958 return needsCmpXchgNb(SI->getValueOperand()->getType()); 16959 } 16960 16961 bool X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *SI) const { 16962 return false; // FIXME, currently these are expanded separately in this file. 16963 } 16964 16965 bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { 16966 const X86Subtarget &Subtarget = 16967 getTargetMachine().getSubtarget<X86Subtarget>(); 16968 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32; 16969 const Type *MemType = AI->getType(); 16970 16971 // If the operand is too big, we must see if cmpxchg8/16b is available 16972 // and default to library calls otherwise. 16973 if (MemType->getPrimitiveSizeInBits() > NativeWidth) 16974 return needsCmpXchgNb(MemType); 16975 16976 AtomicRMWInst::BinOp Op = AI->getOperation(); 16977 switch (Op) { 16978 default: 16979 llvm_unreachable("Unknown atomic operation"); 16980 case AtomicRMWInst::Xchg: 16981 case AtomicRMWInst::Add: 16982 case AtomicRMWInst::Sub: 16983 // It's better to use xadd, xsub or xchg for these in all cases. 16984 return false; 16985 case AtomicRMWInst::Or: 16986 case AtomicRMWInst::And: 16987 case AtomicRMWInst::Xor: 16988 // If the atomicrmw's result isn't actually used, we can just add a "lock" 16989 // prefix to a normal instruction for these operations. 16990 return !AI->use_empty(); 16991 case AtomicRMWInst::Nand: 16992 case AtomicRMWInst::Max: 16993 case AtomicRMWInst::Min: 16994 case AtomicRMWInst::UMax: 16995 case AtomicRMWInst::UMin: 16996 // These always require a non-trivial set of data operations on x86. We must 16997 // use a cmpxchg loop. 16998 return true; 16999 } 17000 } 17001 17002 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget, 17003 SelectionDAG &DAG) { 17004 SDLoc dl(Op); 17005 AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>( 17006 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()); 17007 SynchronizationScope FenceScope = static_cast<SynchronizationScope>( 17008 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue()); 17009 17010 // The only fence that needs an instruction is a sequentially-consistent 17011 // cross-thread fence. 17012 if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) { 17013 // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for 17014 // no-sse2). There isn't any reason to disable it if the target processor 17015 // supports it. 
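    // Without SSE2 on a 32-bit target, fall back to a locked OR of zero into
    // the top of the stack; a locked read-modify-write is also a full barrier.
    // The operand list below builds that memory operand: base ESP, scale 1,
    // no index, displacement 0, no segment.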
17016 if (Subtarget->hasSSE2() || Subtarget->is64Bit()) 17017 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); 17018 17019 SDValue Chain = Op.getOperand(0); 17020 SDValue Zero = DAG.getConstant(0, MVT::i32); 17021 SDValue Ops[] = { 17022 DAG.getRegister(X86::ESP, MVT::i32), // Base 17023 DAG.getTargetConstant(1, MVT::i8), // Scale 17024 DAG.getRegister(0, MVT::i32), // Index 17025 DAG.getTargetConstant(0, MVT::i32), // Disp 17026 DAG.getRegister(0, MVT::i32), // Segment. 17027 Zero, 17028 Chain 17029 }; 17030 SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops); 17031 return SDValue(Res, 0); 17032 } 17033 17034 // MEMBARRIER is a compiler barrier; it codegens to a no-op. 17035 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); 17036 } 17037 17038 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget, 17039 SelectionDAG &DAG) { 17040 MVT T = Op.getSimpleValueType(); 17041 SDLoc DL(Op); 17042 unsigned Reg = 0; 17043 unsigned size = 0; 17044 switch(T.SimpleTy) { 17045 default: llvm_unreachable("Invalid value type!"); 17046 case MVT::i8: Reg = X86::AL; size = 1; break; 17047 case MVT::i16: Reg = X86::AX; size = 2; break; 17048 case MVT::i32: Reg = X86::EAX; size = 4; break; 17049 case MVT::i64: 17050 assert(Subtarget->is64Bit() && "Node not type legal!"); 17051 Reg = X86::RAX; size = 8; 17052 break; 17053 } 17054 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg, 17055 Op.getOperand(2), SDValue()); 17056 SDValue Ops[] = { cpIn.getValue(0), 17057 Op.getOperand(1), 17058 Op.getOperand(3), 17059 DAG.getTargetConstant(size, MVT::i8), 17060 cpIn.getValue(1) }; 17061 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 17062 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand(); 17063 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, 17064 Ops, T, MMO); 17065 17066 SDValue cpOut = 17067 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1)); 17068 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS, 17069 MVT::i32, cpOut.getValue(2)); 17070 SDValue Success = DAG.getNode(X86ISD::SETCC, DL, Op->getValueType(1), 17071 DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS); 17072 17073 DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut); 17074 DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success); 17075 DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1)); 17076 return SDValue(); 17077 } 17078 17079 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget, 17080 SelectionDAG &DAG) { 17081 MVT SrcVT = Op.getOperand(0).getSimpleValueType(); 17082 MVT DstVT = Op.getSimpleValueType(); 17083 17084 if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) { 17085 assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); 17086 if (DstVT != MVT::f64) 17087 // This conversion needs to be expanded. 17088 return SDValue(); 17089 17090 SDValue InVec = Op->getOperand(0); 17091 SDLoc dl(Op); 17092 unsigned NumElts = SrcVT.getVectorNumElements(); 17093 EVT SVT = SrcVT.getVectorElementType(); 17094 17095 // Widen the vector in input in the case of MVT::v2i32. 17096 // Example: from MVT::v2i32 to MVT::v4i32. 17097 SmallVector<SDValue, 16> Elts; 17098 for (unsigned i = 0, e = NumElts; i != e; ++i) 17099 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, InVec, 17100 DAG.getIntPtrConstant(i))); 17101 17102 // Explicitly mark the extra elements as Undef. 
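    // E.g. (f64 (bitcast (v2i32 X))) becomes
    //   (extract_vector_elt (v2f64 (bitcast (v4i32 <X0, X1, undef, undef>))), 0)
    // so only the low 64 bits (the original v2i32 payload) survive.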
17103 SDValue Undef = DAG.getUNDEF(SVT); 17104 for (unsigned i = NumElts, e = NumElts * 2; i != e; ++i) 17105 Elts.push_back(Undef); 17106 17107 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2); 17108 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Elts); 17109 SDValue ToV2F64 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, BV); 17110 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64, 17111 DAG.getIntPtrConstant(0)); 17112 } 17113 17114 assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() && 17115 Subtarget->hasMMX() && "Unexpected custom BITCAST"); 17116 assert((DstVT == MVT::i64 || 17117 (DstVT.isVector() && DstVT.getSizeInBits()==64)) && 17118 "Unexpected custom BITCAST"); 17119 // i64 <=> MMX conversions are Legal. 17120 if (SrcVT==MVT::i64 && DstVT.isVector()) 17121 return Op; 17122 if (DstVT==MVT::i64 && SrcVT.isVector()) 17123 return Op; 17124 // MMX <=> MMX conversions are Legal. 17125 if (SrcVT.isVector() && DstVT.isVector()) 17126 return Op; 17127 // All other conversions need to be expanded. 17128 return SDValue(); 17129 } 17130 17131 static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) { 17132 SDNode *Node = Op.getNode(); 17133 SDLoc dl(Node); 17134 EVT T = Node->getValueType(0); 17135 SDValue negOp = DAG.getNode(ISD::SUB, dl, T, 17136 DAG.getConstant(0, T), Node->getOperand(2)); 17137 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, 17138 cast<AtomicSDNode>(Node)->getMemoryVT(), 17139 Node->getOperand(0), 17140 Node->getOperand(1), negOp, 17141 cast<AtomicSDNode>(Node)->getMemOperand(), 17142 cast<AtomicSDNode>(Node)->getOrdering(), 17143 cast<AtomicSDNode>(Node)->getSynchScope()); 17144 } 17145 17146 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) { 17147 SDNode *Node = Op.getNode(); 17148 SDLoc dl(Node); 17149 EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT(); 17150 17151 // Convert seq_cst store -> xchg 17152 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b) 17153 // FIXME: On 32-bit, store -> fist or movq would be more efficient 17154 // (The only way to get a 16-byte store is cmpxchg16b) 17155 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment. 17156 if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent || 17157 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) { 17158 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, 17159 cast<AtomicSDNode>(Node)->getMemoryVT(), 17160 Node->getOperand(0), 17161 Node->getOperand(1), Node->getOperand(2), 17162 cast<AtomicSDNode>(Node)->getMemOperand(), 17163 cast<AtomicSDNode>(Node)->getOrdering(), 17164 cast<AtomicSDNode>(Node)->getSynchScope()); 17165 return Swap.getValue(1); 17166 } 17167 // Other atomic stores have a simple pattern. 17168 return Op; 17169 } 17170 17171 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { 17172 EVT VT = Op.getNode()->getSimpleValueType(0); 17173 17174 // Let legalize expand this if it isn't a legal type yet. 
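  // ADDC/ADDE and SUBC/SUBE map directly onto ADD/ADC and SUB/SBB; the carry
  // is produced and consumed through EFLAGS (the extra MVT::i32 result in the
  // VT list below). E.g. an i128 add on x86-64 legalizes to an ADD of the low
  // halves followed by an ADC of the high halves.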
17175 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 17176 return SDValue(); 17177 17178 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 17179 17180 unsigned Opc; 17181 bool ExtraOp = false; 17182 switch (Op.getOpcode()) { 17183 default: llvm_unreachable("Invalid code"); 17184 case ISD::ADDC: Opc = X86ISD::ADD; break; 17185 case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break; 17186 case ISD::SUBC: Opc = X86ISD::SUB; break; 17187 case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break; 17188 } 17189 17190 if (!ExtraOp) 17191 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), 17192 Op.getOperand(1)); 17193 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), 17194 Op.getOperand(1), Op.getOperand(2)); 17195 } 17196 17197 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget, 17198 SelectionDAG &DAG) { 17199 assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit()); 17200 17201 // For MacOSX, we want to call an alternative entry point: __sincos_stret, 17202 // which returns the values as { float, float } (in XMM0) or 17203 // { double, double } (which is returned in XMM0, XMM1). 17204 SDLoc dl(Op); 17205 SDValue Arg = Op.getOperand(0); 17206 EVT ArgVT = Arg.getValueType(); 17207 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 17208 17209 TargetLowering::ArgListTy Args; 17210 TargetLowering::ArgListEntry Entry; 17211 17212 Entry.Node = Arg; 17213 Entry.Ty = ArgTy; 17214 Entry.isSExt = false; 17215 Entry.isZExt = false; 17216 Args.push_back(Entry); 17217 17218 bool isF64 = ArgVT == MVT::f64; 17219 // Only optimize x86_64 for now. i386 is a bit messy. For f32, 17220 // the small struct {f32, f32} is returned in (eax, edx). For f64, 17221 // the results are returned via SRet in memory. 17222 const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret"; 17223 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 17224 SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy()); 17225 17226 Type *RetTy = isF64 17227 ? (Type*)StructType::get(ArgTy, ArgTy, NULL) 17228 : (Type*)VectorType::get(ArgTy, 4); 17229 17230 TargetLowering::CallLoweringInfo CLI(DAG); 17231 CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) 17232 .setCallee(CallingConv::C, RetTy, Callee, std::move(Args), 0); 17233 17234 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); 17235 17236 if (isF64) 17237 // Returned in xmm0 and xmm1. 17238 return CallResult.first; 17239 17240 // Returned in bits 0:31 and 32:64 xmm0. 17241 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, 17242 CallResult.first, DAG.getIntPtrConstant(0)); 17243 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, 17244 CallResult.first, DAG.getIntPtrConstant(1)); 17245 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); 17246 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal); 17247 } 17248 17249 /// LowerOperation - Provide custom lowering hooks for some operations. 
17250 /// 17251 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 17252 switch (Op.getOpcode()) { 17253 default: llvm_unreachable("Should not custom lower this!"); 17254 case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op,DAG); 17255 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG); 17256 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: 17257 return LowerCMP_SWAP(Op, Subtarget, DAG); 17258 case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); 17259 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG); 17260 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); 17261 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); 17262 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); 17263 case ISD::VSELECT: return LowerVSELECT(Op, DAG); 17264 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 17265 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 17266 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG); 17267 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG); 17268 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); 17269 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 17270 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 17271 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 17272 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); 17273 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 17274 case ISD::SHL_PARTS: 17275 case ISD::SRA_PARTS: 17276 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); 17277 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); 17278 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); 17279 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG); 17280 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG); 17281 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG); 17282 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG); 17283 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); 17284 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); 17285 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); 17286 case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG); 17287 case ISD::FABS: 17288 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG); 17289 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 17290 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG); 17291 case ISD::SETCC: return LowerSETCC(Op, DAG); 17292 case ISD::SELECT: return LowerSELECT(Op, DAG); 17293 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 17294 case ISD::JumpTable: return LowerJumpTable(Op, DAG); 17295 case ISD::VASTART: return LowerVASTART(Op, DAG); 17296 case ISD::VAARG: return LowerVAARG(Op, DAG); 17297 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG); 17298 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 17299 case ISD::INTRINSIC_VOID: 17300 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG); 17301 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 17302 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 17303 case ISD::FRAME_TO_ARGS_OFFSET: 17304 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); 17305 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); 17306 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); 17307 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG); 17308 case ISD::EH_SJLJ_LONGJMP: return 
lowerEH_SJLJ_LONGJMP(Op, DAG); 17309 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG); 17310 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); 17311 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 17312 case ISD::CTLZ: return LowerCTLZ(Op, DAG); 17313 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, DAG); 17314 case ISD::CTTZ: return LowerCTTZ(Op, DAG); 17315 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG); 17316 case ISD::UMUL_LOHI: 17317 case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG); 17318 case ISD::SRA: 17319 case ISD::SRL: 17320 case ISD::SHL: return LowerShift(Op, Subtarget, DAG); 17321 case ISD::SADDO: 17322 case ISD::UADDO: 17323 case ISD::SSUBO: 17324 case ISD::USUBO: 17325 case ISD::SMULO: 17326 case ISD::UMULO: return LowerXALUO(Op, DAG); 17327 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG); 17328 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG); 17329 case ISD::ADDC: 17330 case ISD::ADDE: 17331 case ISD::SUBC: 17332 case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); 17333 case ISD::ADD: return LowerADD(Op, DAG); 17334 case ISD::SUB: return LowerSUB(Op, DAG); 17335 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG); 17336 } 17337 } 17338 17339 static void ReplaceATOMIC_LOAD(SDNode *Node, 17340 SmallVectorImpl<SDValue> &Results, 17341 SelectionDAG &DAG) { 17342 SDLoc dl(Node); 17343 EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT(); 17344 17345 // Convert wide load -> cmpxchg8b/cmpxchg16b 17346 // FIXME: On 32-bit, load -> fild or movq would be more efficient 17347 // (The only way to get a 16-byte load is cmpxchg16b) 17348 // FIXME: 16-byte ATOMIC_CMP_SWAP isn't actually hooked up at the moment. 17349 SDValue Zero = DAG.getConstant(0, VT); 17350 SDVTList VTs = DAG.getVTList(VT, MVT::i1, MVT::Other); 17351 SDValue Swap = 17352 DAG.getAtomicCmpSwap(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, dl, VT, VTs, 17353 Node->getOperand(0), Node->getOperand(1), Zero, Zero, 17354 cast<AtomicSDNode>(Node)->getMemOperand(), 17355 cast<AtomicSDNode>(Node)->getOrdering(), 17356 cast<AtomicSDNode>(Node)->getOrdering(), 17357 cast<AtomicSDNode>(Node)->getSynchScope()); 17358 Results.push_back(Swap.getValue(0)); 17359 Results.push_back(Swap.getValue(2)); 17360 } 17361 17362 /// ReplaceNodeResults - Replace a node with an illegal result type 17363 /// with a new node built out of custom code. 17364 void X86TargetLowering::ReplaceNodeResults(SDNode *N, 17365 SmallVectorImpl<SDValue>&Results, 17366 SelectionDAG &DAG) const { 17367 SDLoc dl(N); 17368 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 17369 switch (N->getOpcode()) { 17370 default: 17371 llvm_unreachable("Do not know how to custom type legalize this operation!"); 17372 case ISD::SIGN_EXTEND_INREG: 17373 case ISD::ADDC: 17374 case ISD::ADDE: 17375 case ISD::SUBC: 17376 case ISD::SUBE: 17377 // We don't want to expand or promote these. 
17378 return; 17379 case ISD::SDIV: 17380 case ISD::UDIV: 17381 case ISD::SREM: 17382 case ISD::UREM: 17383 case ISD::SDIVREM: 17384 case ISD::UDIVREM: { 17385 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG); 17386 Results.push_back(V); 17387 return; 17388 } 17389 case ISD::FP_TO_SINT: 17390 case ISD::FP_TO_UINT: { 17391 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; 17392 17393 if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType())) 17394 return; 17395 17396 std::pair<SDValue,SDValue> Vals = 17397 FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true); 17398 SDValue FIST = Vals.first, StackSlot = Vals.second; 17399 if (FIST.getNode()) { 17400 EVT VT = N->getValueType(0); 17401 // Return a load from the stack slot. 17402 if (StackSlot.getNode()) 17403 Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, 17404 MachinePointerInfo(), 17405 false, false, false, 0)); 17406 else 17407 Results.push_back(FIST); 17408 } 17409 return; 17410 } 17411 case ISD::UINT_TO_FP: { 17412 assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); 17413 if (N->getOperand(0).getValueType() != MVT::v2i32 || 17414 N->getValueType(0) != MVT::v2f32) 17415 return; 17416 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, 17417 N->getOperand(0)); 17418 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), 17419 MVT::f64); 17420 SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias); 17421 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn, 17422 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, VBias)); 17423 Or = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or); 17424 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias); 17425 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub)); 17426 return; 17427 } 17428 case ISD::FP_ROUND: { 17429 if (!TLI.isTypeLegal(N->getOperand(0).getValueType())) 17430 return; 17431 SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0)); 17432 Results.push_back(V); 17433 return; 17434 } 17435 case ISD::INTRINSIC_W_CHAIN: { 17436 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); 17437 switch (IntNo) { 17438 default : llvm_unreachable("Do not know how to custom type " 17439 "legalize this intrinsic operation!"); 17440 case Intrinsic::x86_rdtsc: 17441 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, 17442 Results); 17443 case Intrinsic::x86_rdtscp: 17444 return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget, 17445 Results); 17446 case Intrinsic::x86_rdpmc: 17447 return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results); 17448 } 17449 } 17450 case ISD::READCYCLECOUNTER: { 17451 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, 17452 Results); 17453 } 17454 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: { 17455 EVT T = N->getValueType(0); 17456 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair"); 17457 bool Regs64bit = T == MVT::i128; 17458 EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32; 17459 SDValue cpInL, cpInH; 17460 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), 17461 DAG.getConstant(0, HalfT)); 17462 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), 17463 DAG.getConstant(1, HalfT)); 17464 cpInL = DAG.getCopyToReg(N->getOperand(0), dl, 17465 Regs64bit ? X86::RAX : X86::EAX, 17466 cpInL, SDValue()); 17467 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, 17468 Regs64bit ? 
X86::RDX : X86::EDX, 17469 cpInH, cpInL.getValue(1)); 17470 SDValue swapInL, swapInH; 17471 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), 17472 DAG.getConstant(0, HalfT)); 17473 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), 17474 DAG.getConstant(1, HalfT)); 17475 swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, 17476 Regs64bit ? X86::RBX : X86::EBX, 17477 swapInL, cpInH.getValue(1)); 17478 swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, 17479 Regs64bit ? X86::RCX : X86::ECX, 17480 swapInH, swapInL.getValue(1)); 17481 SDValue Ops[] = { swapInH.getValue(0), 17482 N->getOperand(1), 17483 swapInH.getValue(1) }; 17484 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 17485 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand(); 17486 unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG : 17487 X86ISD::LCMPXCHG8_DAG; 17488 SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO); 17489 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, 17490 Regs64bit ? X86::RAX : X86::EAX, 17491 HalfT, Result.getValue(1)); 17492 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, 17493 Regs64bit ? X86::RDX : X86::EDX, 17494 HalfT, cpOutL.getValue(2)); 17495 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; 17496 17497 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS, 17498 MVT::i32, cpOutH.getValue(2)); 17499 SDValue Success = 17500 DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 17501 DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS); 17502 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1)); 17503 17504 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF)); 17505 Results.push_back(Success); 17506 Results.push_back(EFLAGS.getValue(1)); 17507 return; 17508 } 17509 case ISD::ATOMIC_SWAP: 17510 case ISD::ATOMIC_LOAD_ADD: 17511 case ISD::ATOMIC_LOAD_SUB: 17512 case ISD::ATOMIC_LOAD_AND: 17513 case ISD::ATOMIC_LOAD_OR: 17514 case ISD::ATOMIC_LOAD_XOR: 17515 case ISD::ATOMIC_LOAD_NAND: 17516 case ISD::ATOMIC_LOAD_MIN: 17517 case ISD::ATOMIC_LOAD_MAX: 17518 case ISD::ATOMIC_LOAD_UMIN: 17519 case ISD::ATOMIC_LOAD_UMAX: 17520 // Delegate to generic TypeLegalization. Situations we can really handle 17521 // should have already been dealt with by AtomicExpandPass.cpp. 17522 break; 17523 case ISD::ATOMIC_LOAD: { 17524 ReplaceATOMIC_LOAD(N, Results, DAG); 17525 return; 17526 } 17527 case ISD::BITCAST: { 17528 assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); 17529 EVT DstVT = N->getValueType(0); 17530 EVT SrcVT = N->getOperand(0)->getValueType(0); 17531 17532 if (SrcVT != MVT::f64 || 17533 (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8)) 17534 return; 17535 17536 unsigned NumElts = DstVT.getVectorNumElements(); 17537 EVT SVT = DstVT.getVectorElementType(); 17538 EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2); 17539 SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 17540 MVT::v2f64, N->getOperand(0)); 17541 SDValue ToVecInt = DAG.getNode(ISD::BITCAST, dl, WiderVT, Expanded); 17542 17543 if (ExperimentalVectorWideningLegalization) { 17544 // If we are legalizing vectors by widening, we already have the desired 17545 // legal vector type, just return it. 
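      // (E.g. for (v2i32 (bitcast f64 %x)) the nodes built above are roughly
      //    t0: v2f64 = scalar_to_vector %x
      //    t1: v4i32 = bitcast t0
      //  and since widening-based legalization turns v2i32 into v4i32 anyway,
      //  t1 itself is already the final legal value; the element-by-element
      //  repack further down is only needed for the default promotion path.)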
17546 Results.push_back(ToVecInt); 17547 return; 17548 } 17549 17550 SmallVector<SDValue, 8> Elts; 17551 for (unsigned i = 0, e = NumElts; i != e; ++i) 17552 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, 17553 ToVecInt, DAG.getIntPtrConstant(i))); 17554 17555 Results.push_back(DAG.getNode(ISD::BUILD_VECTOR, dl, DstVT, Elts)); 17556 } 17557 } 17558 } 17559 17560 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { 17561 switch (Opcode) { 17562 default: return nullptr; 17563 case X86ISD::BSF: return "X86ISD::BSF"; 17564 case X86ISD::BSR: return "X86ISD::BSR"; 17565 case X86ISD::SHLD: return "X86ISD::SHLD"; 17566 case X86ISD::SHRD: return "X86ISD::SHRD"; 17567 case X86ISD::FAND: return "X86ISD::FAND"; 17568 case X86ISD::FANDN: return "X86ISD::FANDN"; 17569 case X86ISD::FOR: return "X86ISD::FOR"; 17570 case X86ISD::FXOR: return "X86ISD::FXOR"; 17571 case X86ISD::FSRL: return "X86ISD::FSRL"; 17572 case X86ISD::FILD: return "X86ISD::FILD"; 17573 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; 17574 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; 17575 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; 17576 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; 17577 case X86ISD::FLD: return "X86ISD::FLD"; 17578 case X86ISD::FST: return "X86ISD::FST"; 17579 case X86ISD::CALL: return "X86ISD::CALL"; 17580 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; 17581 case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG"; 17582 case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG"; 17583 case X86ISD::BT: return "X86ISD::BT"; 17584 case X86ISD::CMP: return "X86ISD::CMP"; 17585 case X86ISD::COMI: return "X86ISD::COMI"; 17586 case X86ISD::UCOMI: return "X86ISD::UCOMI"; 17587 case X86ISD::CMPM: return "X86ISD::CMPM"; 17588 case X86ISD::CMPMU: return "X86ISD::CMPMU"; 17589 case X86ISD::SETCC: return "X86ISD::SETCC"; 17590 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; 17591 case X86ISD::FSETCC: return "X86ISD::FSETCC"; 17592 case X86ISD::CMOV: return "X86ISD::CMOV"; 17593 case X86ISD::BRCOND: return "X86ISD::BRCOND"; 17594 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; 17595 case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; 17596 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; 17597 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; 17598 case X86ISD::Wrapper: return "X86ISD::Wrapper"; 17599 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; 17600 case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; 17601 case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; 17602 case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; 17603 case X86ISD::PINSRB: return "X86ISD::PINSRB"; 17604 case X86ISD::PINSRW: return "X86ISD::PINSRW"; 17605 case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; 17606 case X86ISD::ANDNP: return "X86ISD::ANDNP"; 17607 case X86ISD::PSIGN: return "X86ISD::PSIGN"; 17608 case X86ISD::BLENDV: return "X86ISD::BLENDV"; 17609 case X86ISD::BLENDI: return "X86ISD::BLENDI"; 17610 case X86ISD::SUBUS: return "X86ISD::SUBUS"; 17611 case X86ISD::HADD: return "X86ISD::HADD"; 17612 case X86ISD::HSUB: return "X86ISD::HSUB"; 17613 case X86ISD::FHADD: return "X86ISD::FHADD"; 17614 case X86ISD::FHSUB: return "X86ISD::FHSUB"; 17615 case X86ISD::UMAX: return "X86ISD::UMAX"; 17616 case X86ISD::UMIN: return "X86ISD::UMIN"; 17617 case X86ISD::SMAX: return "X86ISD::SMAX"; 17618 case X86ISD::SMIN: return "X86ISD::SMIN"; 17619 case X86ISD::FMAX: return "X86ISD::FMAX"; 17620 case X86ISD::FMIN: return "X86ISD::FMIN"; 17621 case 
X86ISD::FMAXC: return "X86ISD::FMAXC"; 17622 case X86ISD::FMINC: return "X86ISD::FMINC"; 17623 case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; 17624 case X86ISD::FRCP: return "X86ISD::FRCP"; 17625 case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; 17626 case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR"; 17627 case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; 17628 case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP"; 17629 case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP"; 17630 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; 17631 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; 17632 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; 17633 case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r"; 17634 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; 17635 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; 17636 case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG"; 17637 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; 17638 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; 17639 case X86ISD::VZEXT: return "X86ISD::VZEXT"; 17640 case X86ISD::VSEXT: return "X86ISD::VSEXT"; 17641 case X86ISD::VTRUNC: return "X86ISD::VTRUNC"; 17642 case X86ISD::VTRUNCM: return "X86ISD::VTRUNCM"; 17643 case X86ISD::VINSERT: return "X86ISD::VINSERT"; 17644 case X86ISD::VFPEXT: return "X86ISD::VFPEXT"; 17645 case X86ISD::VFPROUND: return "X86ISD::VFPROUND"; 17646 case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ"; 17647 case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ"; 17648 case X86ISD::VSHL: return "X86ISD::VSHL"; 17649 case X86ISD::VSRL: return "X86ISD::VSRL"; 17650 case X86ISD::VSRA: return "X86ISD::VSRA"; 17651 case X86ISD::VSHLI: return "X86ISD::VSHLI"; 17652 case X86ISD::VSRLI: return "X86ISD::VSRLI"; 17653 case X86ISD::VSRAI: return "X86ISD::VSRAI"; 17654 case X86ISD::CMPP: return "X86ISD::CMPP"; 17655 case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ"; 17656 case X86ISD::PCMPGT: return "X86ISD::PCMPGT"; 17657 case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM"; 17658 case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM"; 17659 case X86ISD::ADD: return "X86ISD::ADD"; 17660 case X86ISD::SUB: return "X86ISD::SUB"; 17661 case X86ISD::ADC: return "X86ISD::ADC"; 17662 case X86ISD::SBB: return "X86ISD::SBB"; 17663 case X86ISD::SMUL: return "X86ISD::SMUL"; 17664 case X86ISD::UMUL: return "X86ISD::UMUL"; 17665 case X86ISD::INC: return "X86ISD::INC"; 17666 case X86ISD::DEC: return "X86ISD::DEC"; 17667 case X86ISD::OR: return "X86ISD::OR"; 17668 case X86ISD::XOR: return "X86ISD::XOR"; 17669 case X86ISD::AND: return "X86ISD::AND"; 17670 case X86ISD::BEXTR: return "X86ISD::BEXTR"; 17671 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; 17672 case X86ISD::PTEST: return "X86ISD::PTEST"; 17673 case X86ISD::TESTP: return "X86ISD::TESTP"; 17674 case X86ISD::TESTM: return "X86ISD::TESTM"; 17675 case X86ISD::TESTNM: return "X86ISD::TESTNM"; 17676 case X86ISD::KORTEST: return "X86ISD::KORTEST"; 17677 case X86ISD::PACKSS: return "X86ISD::PACKSS"; 17678 case X86ISD::PACKUS: return "X86ISD::PACKUS"; 17679 case X86ISD::PALIGNR: return "X86ISD::PALIGNR"; 17680 case X86ISD::VALIGN: return "X86ISD::VALIGN"; 17681 case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; 17682 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; 17683 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; 17684 case X86ISD::SHUFP: return "X86ISD::SHUFP"; 17685 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS"; 17686 case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD"; 17687 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS"; 17688 case X86ISD::MOVLPS: 
return "X86ISD::MOVLPS"; 17689 case X86ISD::MOVLPD: return "X86ISD::MOVLPD"; 17690 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP"; 17691 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP"; 17692 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP"; 17693 case X86ISD::MOVSD: return "X86ISD::MOVSD"; 17694 case X86ISD::MOVSS: return "X86ISD::MOVSS"; 17695 case X86ISD::UNPCKL: return "X86ISD::UNPCKL"; 17696 case X86ISD::UNPCKH: return "X86ISD::UNPCKH"; 17697 case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; 17698 case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM"; 17699 case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT"; 17700 case X86ISD::VPERMILP: return "X86ISD::VPERMILP"; 17701 case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128"; 17702 case X86ISD::VPERMV: return "X86ISD::VPERMV"; 17703 case X86ISD::VPERMV3: return "X86ISD::VPERMV3"; 17704 case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3"; 17705 case X86ISD::VPERMI: return "X86ISD::VPERMI"; 17706 case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ"; 17707 case X86ISD::PMULDQ: return "X86ISD::PMULDQ"; 17708 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; 17709 case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; 17710 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; 17711 case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER"; 17712 case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA"; 17713 case X86ISD::WIN_FTOL: return "X86ISD::WIN_FTOL"; 17714 case X86ISD::SAHF: return "X86ISD::SAHF"; 17715 case X86ISD::RDRAND: return "X86ISD::RDRAND"; 17716 case X86ISD::RDSEED: return "X86ISD::RDSEED"; 17717 case X86ISD::FMADD: return "X86ISD::FMADD"; 17718 case X86ISD::FMSUB: return "X86ISD::FMSUB"; 17719 case X86ISD::FNMADD: return "X86ISD::FNMADD"; 17720 case X86ISD::FNMSUB: return "X86ISD::FNMSUB"; 17721 case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB"; 17722 case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD"; 17723 case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI"; 17724 case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI"; 17725 case X86ISD::XTEST: return "X86ISD::XTEST"; 17726 } 17727 } 17728 17729 // isLegalAddressingMode - Return true if the addressing mode represented 17730 // by AM is legal for this target, for a load/store of the specified type. 17731 bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, 17732 Type *Ty) const { 17733 // X86 supports extremely general addressing modes. 17734 CodeModel::Model M = getTargetMachine().getCodeModel(); 17735 Reloc::Model R = getTargetMachine().getRelocationModel(); 17736 17737 // X86 allows a sign-extended 32-bit immediate field as a displacement. 17738 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr)) 17739 return false; 17740 17741 if (AM.BaseGV) { 17742 unsigned GVFlags = 17743 Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine()); 17744 17745 // If a reference to this global requires an extra load, we can't fold it. 17746 if (isGlobalStubReference(GVFlags)) 17747 return false; 17748 17749 // If BaseGV requires a register for the PIC base, we cannot also have a 17750 // BaseReg specified. 17751 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags)) 17752 return false; 17753 17754 // If lower 4G is not available, then we must use rip-relative addressing. 
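    // (A RIP-relative access is just %rip plus a signed 32-bit displacement,
    //  e.g. "movq sym(%rip), %rax"; in particular it cannot also carry a
    //  scaled index register.)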
17755 if ((M != CodeModel::Small || R != Reloc::Static) && 17756 Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1)) 17757 return false; 17758 } 17759 17760 switch (AM.Scale) { 17761 case 0: 17762 case 1: 17763 case 2: 17764 case 4: 17765 case 8: 17766 // These scales always work. 17767 break; 17768 case 3: 17769 case 5: 17770 case 9: 17771 // These scales are formed with basereg+scalereg. Only accept if there is 17772 // no basereg yet. 17773 if (AM.HasBaseReg) 17774 return false; 17775 break; 17776 default: // Other stuff never works. 17777 return false; 17778 } 17779 17780 return true; 17781 } 17782 17783 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const { 17784 unsigned Bits = Ty->getScalarSizeInBits(); 17785 17786 // 8-bit shifts are always expensive, but versions with a scalar amount aren't 17787 // particularly cheaper than those without. 17788 if (Bits == 8) 17789 return false; 17790 17791 // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make 17792 // variable shifts just as cheap as scalar ones. 17793 if (Subtarget->hasInt256() && (Bits == 32 || Bits == 64)) 17794 return false; 17795 17796 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a 17797 // fully general vector. 17798 return true; 17799 } 17800 17801 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { 17802 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 17803 return false; 17804 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); 17805 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); 17806 return NumBits1 > NumBits2; 17807 } 17808 17809 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const { 17810 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 17811 return false; 17812 17813 if (!isTypeLegal(EVT::getEVT(Ty1))) 17814 return false; 17815 17816 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop"); 17817 17818 // Assuming the caller doesn't have a zeroext or signext return parameter, 17819 // truncation all the way down to i1 is valid. 17820 return true; 17821 } 17822 17823 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const { 17824 return isInt<32>(Imm); 17825 } 17826 17827 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const { 17828 // Can also use sub to handle negated immediates. 17829 return isInt<32>(Imm); 17830 } 17831 17832 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { 17833 if (!VT1.isInteger() || !VT2.isInteger()) 17834 return false; 17835 unsigned NumBits1 = VT1.getSizeInBits(); 17836 unsigned NumBits2 = VT2.getSizeInBits(); 17837 return NumBits1 > NumBits2; 17838 } 17839 17840 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { 17841 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 17842 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit(); 17843 } 17844 17845 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { 17846 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 
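  // (E.g. "movl %esi, %eax" already clears bits 63:32 of %rax, so an i32
  //  value consumed as i64 never needs a separate zero-extension
  //  instruction.)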
17847 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit(); 17848 } 17849 17850 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { 17851 EVT VT1 = Val.getValueType(); 17852 if (isZExtFree(VT1, VT2)) 17853 return true; 17854 17855 if (Val.getOpcode() != ISD::LOAD) 17856 return false; 17857 17858 if (!VT1.isSimple() || !VT1.isInteger() || 17859 !VT2.isSimple() || !VT2.isInteger()) 17860 return false; 17861 17862 switch (VT1.getSimpleVT().SimpleTy) { 17863 default: break; 17864 case MVT::i8: 17865 case MVT::i16: 17866 case MVT::i32: 17867 // X86 has 8, 16, and 32-bit zero-extending loads. 17868 return true; 17869 } 17870 17871 return false; 17872 } 17873 17874 bool 17875 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { 17876 if (!(Subtarget->hasFMA() || Subtarget->hasFMA4())) 17877 return false; 17878 17879 VT = VT.getScalarType(); 17880 17881 if (!VT.isSimple()) 17882 return false; 17883 17884 switch (VT.getSimpleVT().SimpleTy) { 17885 case MVT::f32: 17886 case MVT::f64: 17887 return true; 17888 default: 17889 break; 17890 } 17891 17892 return false; 17893 } 17894 17895 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { 17896 // i16 instructions are longer (0x66 prefix) and potentially slower. 17897 return !(VT1 == MVT::i32 && VT2 == MVT::i16); 17898 } 17899 17900 /// isShuffleMaskLegal - Targets can use this to indicate that they only 17901 /// support *some* VECTOR_SHUFFLE operations, those with specific masks. 17902 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 17903 /// are assumed to be legal. 17904 bool 17905 X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, 17906 EVT VT) const { 17907 if (!VT.isSimple()) 17908 return false; 17909 17910 MVT SVT = VT.getSimpleVT(); 17911 17912 // Very little shuffling can be done for 64-bit vectors right now. 17913 if (VT.getSizeInBits() == 64) 17914 return false; 17915 17916 // If this is a single-input shuffle with no 128 bit lane crossings we can 17917 // lower it into pshufb. 17918 if ((SVT.is128BitVector() && Subtarget->hasSSSE3()) || 17919 (SVT.is256BitVector() && Subtarget->hasInt256())) { 17920 bool isLegal = true; 17921 for (unsigned I = 0, E = M.size(); I != E; ++I) { 17922 if (M[I] >= (int)SVT.getVectorNumElements() || 17923 ShuffleCrosses128bitLane(SVT, I, M[I])) { 17924 isLegal = false; 17925 break; 17926 } 17927 } 17928 if (isLegal) 17929 return true; 17930 } 17931 17932 // FIXME: blends, shifts. 17933 return (SVT.getVectorNumElements() == 2 || 17934 ShuffleVectorSDNode::isSplatMask(&M[0], VT) || 17935 isMOVLMask(M, SVT) || 17936 isMOVHLPSMask(M, SVT) || 17937 isSHUFPMask(M, SVT) || 17938 isPSHUFDMask(M, SVT) || 17939 isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) || 17940 isPSHUFLWMask(M, SVT, Subtarget->hasInt256()) || 17941 isPALIGNRMask(M, SVT, Subtarget) || 17942 isUNPCKLMask(M, SVT, Subtarget->hasInt256()) || 17943 isUNPCKHMask(M, SVT, Subtarget->hasInt256()) || 17944 isUNPCKL_v_undef_Mask(M, SVT, Subtarget->hasInt256()) || 17945 isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256()) || 17946 isBlendMask(M, SVT, Subtarget->hasSSE41(), Subtarget->hasInt256())); 17947 } 17948 17949 bool 17950 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, 17951 EVT VT) const { 17952 if (!VT.isSimple()) 17953 return false; 17954 17955 MVT SVT = VT.getSimpleVT(); 17956 unsigned NumElts = SVT.getVectorNumElements(); 17957 // FIXME: This collection of masks seems suspect. 
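  // (A "clear" mask is a shuffle of a value against the zero vector that only
  //  zeroes lanes, e.g. <0, 5, 2, 7> on v4i32 keeps elements 0 and 2 of the
  //  input and takes zeros for the other two; the DAG combiner asks about such
  //  masks when turning a vector AND with a 0/-1 constant into a shuffle.)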
17958 if (NumElts == 2) 17959 return true; 17960 if (NumElts == 4 && SVT.is128BitVector()) { 17961 return (isMOVLMask(Mask, SVT) || 17962 isCommutedMOVLMask(Mask, SVT, true) || 17963 isSHUFPMask(Mask, SVT) || 17964 isSHUFPMask(Mask, SVT, /* Commuted */ true)); 17965 } 17966 return false; 17967 } 17968 17969 //===----------------------------------------------------------------------===// 17970 // X86 Scheduler Hooks 17971 //===----------------------------------------------------------------------===// 17972 17973 /// Utility function to emit xbegin specifying the start of an RTM region. 17974 static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB, 17975 const TargetInstrInfo *TII) { 17976 DebugLoc DL = MI->getDebugLoc(); 17977 17978 const BasicBlock *BB = MBB->getBasicBlock(); 17979 MachineFunction::iterator I = MBB; 17980 ++I; 17981 17982 // For the v = xbegin(), we generate 17983 // 17984 // thisMBB: 17985 // xbegin sinkMBB 17986 // 17987 // mainMBB: 17988 // eax = -1 17989 // 17990 // sinkMBB: 17991 // v = eax 17992 17993 MachineBasicBlock *thisMBB = MBB; 17994 MachineFunction *MF = MBB->getParent(); 17995 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); 17996 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); 17997 MF->insert(I, mainMBB); 17998 MF->insert(I, sinkMBB); 17999 18000 // Transfer the remainder of BB and its successor edges to sinkMBB. 18001 sinkMBB->splice(sinkMBB->begin(), MBB, 18002 std::next(MachineBasicBlock::iterator(MI)), MBB->end()); 18003 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); 18004 18005 // thisMBB: 18006 // xbegin sinkMBB 18007 // # fallthrough to mainMBB 18008 // # abortion to sinkMBB 18009 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB); 18010 thisMBB->addSuccessor(mainMBB); 18011 thisMBB->addSuccessor(sinkMBB); 18012 18013 // mainMBB: 18014 // EAX = -1 18015 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1); 18016 mainMBB->addSuccessor(sinkMBB); 18017 18018 // sinkMBB: 18019 // EAX is live into the sinkMBB 18020 sinkMBB->addLiveIn(X86::EAX); 18021 BuildMI(*sinkMBB, sinkMBB->begin(), DL, 18022 TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) 18023 .addReg(X86::EAX); 18024 18025 MI->eraseFromParent(); 18026 return sinkMBB; 18027 } 18028 18029 // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 18030 // or XMM0_V32I8 in AVX all of this code can be replaced with that 18031 // in the .td file. 
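// For instance, a PCMPISTRM128REG pseudo is rewritten below into roughly
//   PCMPISTRM128rr %src1, %src2, imm   ; real instruction, implicitly defs XMM0
//   %dst = COPY %xmm0                  ; make the implicit result explicit
// (operand names illustrative).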
18032 static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB, 18033 const TargetInstrInfo *TII) { 18034 unsigned Opc; 18035 switch (MI->getOpcode()) { 18036 default: llvm_unreachable("illegal opcode!"); 18037 case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break; 18038 case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break; 18039 case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break; 18040 case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break; 18041 case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break; 18042 case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break; 18043 case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break; 18044 case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break; 18045 } 18046 18047 DebugLoc dl = MI->getDebugLoc(); 18048 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); 18049 18050 unsigned NumArgs = MI->getNumOperands(); 18051 for (unsigned i = 1; i < NumArgs; ++i) { 18052 MachineOperand &Op = MI->getOperand(i); 18053 if (!(Op.isReg() && Op.isImplicit())) 18054 MIB.addOperand(Op); 18055 } 18056 if (MI->hasOneMemOperand()) 18057 MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); 18058 18059 BuildMI(*BB, MI, dl, 18060 TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) 18061 .addReg(X86::XMM0); 18062 18063 MI->eraseFromParent(); 18064 return BB; 18065 } 18066 18067 // FIXME: Custom handling because TableGen doesn't support multiple implicit 18068 // defs in an instruction pattern 18069 static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB, 18070 const TargetInstrInfo *TII) { 18071 unsigned Opc; 18072 switch (MI->getOpcode()) { 18073 default: llvm_unreachable("illegal opcode!"); 18074 case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break; 18075 case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break; 18076 case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break; 18077 case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break; 18078 case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break; 18079 case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break; 18080 case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break; 18081 case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break; 18082 } 18083 18084 DebugLoc dl = MI->getDebugLoc(); 18085 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); 18086 18087 unsigned NumArgs = MI->getNumOperands(); // remove the results 18088 for (unsigned i = 1; i < NumArgs; ++i) { 18089 MachineOperand &Op = MI->getOperand(i); 18090 if (!(Op.isReg() && Op.isImplicit())) 18091 MIB.addOperand(Op); 18092 } 18093 if (MI->hasOneMemOperand()) 18094 MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); 18095 18096 BuildMI(*BB, MI, dl, 18097 TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) 18098 .addReg(X86::ECX); 18099 18100 MI->eraseFromParent(); 18101 return BB; 18102 } 18103 18104 static MachineBasicBlock * EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB, 18105 const TargetInstrInfo *TII, 18106 const X86Subtarget* Subtarget) { 18107 DebugLoc dl = MI->getDebugLoc(); 18108 18109 // Address into RAX/EAX, other two args into ECX, EDX. 18110 unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r; 18111 unsigned MemReg = Subtarget->is64Bit() ? 
X86::RAX : X86::EAX; 18112 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg); 18113 for (int i = 0; i < X86::AddrNumOperands; ++i) 18114 MIB.addOperand(MI->getOperand(i)); 18115 18116 unsigned ValOps = X86::AddrNumOperands; 18117 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX) 18118 .addReg(MI->getOperand(ValOps).getReg()); 18119 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX) 18120 .addReg(MI->getOperand(ValOps+1).getReg()); 18121 18122 // The instruction doesn't actually take any operands though. 18123 BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr)); 18124 18125 MI->eraseFromParent(); // The pseudo is gone now. 18126 return BB; 18127 } 18128 18129 MachineBasicBlock * 18130 X86TargetLowering::EmitVAARG64WithCustomInserter( 18131 MachineInstr *MI, 18132 MachineBasicBlock *MBB) const { 18133 // Emit va_arg instruction on X86-64. 18134 18135 // Operands to this pseudo-instruction: 18136 // 0 ) Output : destination address (reg) 18137 // 1-5) Input : va_list address (addr, i64mem) 18138 // 6 ) ArgSize : Size (in bytes) of vararg type 18139 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset 18140 // 8 ) Align : Alignment of type 18141 // 9 ) EFLAGS (implicit-def) 18142 18143 assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!"); 18144 assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands"); 18145 18146 unsigned DestReg = MI->getOperand(0).getReg(); 18147 MachineOperand &Base = MI->getOperand(1); 18148 MachineOperand &Scale = MI->getOperand(2); 18149 MachineOperand &Index = MI->getOperand(3); 18150 MachineOperand &Disp = MI->getOperand(4); 18151 MachineOperand &Segment = MI->getOperand(5); 18152 unsigned ArgSize = MI->getOperand(6).getImm(); 18153 unsigned ArgMode = MI->getOperand(7).getImm(); 18154 unsigned Align = MI->getOperand(8).getImm(); 18155 18156 // Memory Reference 18157 assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand"); 18158 MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); 18159 MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); 18160 18161 // Machine Information 18162 const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo(); 18163 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 18164 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64); 18165 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32); 18166 DebugLoc DL = MI->getDebugLoc(); 18167 18168 // struct va_list { 18169 // i32 gp_offset 18170 // i32 fp_offset 18171 // i64 overflow_area (address) 18172 // i64 reg_save_area (address) 18173 // } 18174 // sizeof(va_list) = 24 18175 // alignment(va_list) = 8 18176 18177 unsigned TotalNumIntRegs = 6; 18178 unsigned TotalNumXMMRegs = 8; 18179 bool UseGPOffset = (ArgMode == 1); 18180 bool UseFPOffset = (ArgMode == 2); 18181 unsigned MaxOffset = TotalNumIntRegs * 8 + 18182 (UseFPOffset ? 
TotalNumXMMRegs * 16 : 0); 18183 18184 /* Align ArgSize to a multiple of 8 */ 18185 unsigned ArgSizeA8 = (ArgSize + 7) & ~7; 18186 bool NeedsAlign = (Align > 8); 18187 18188 MachineBasicBlock *thisMBB = MBB; 18189 MachineBasicBlock *overflowMBB; 18190 MachineBasicBlock *offsetMBB; 18191 MachineBasicBlock *endMBB; 18192 18193 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB 18194 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB 18195 unsigned OffsetReg = 0; 18196 18197 if (!UseGPOffset && !UseFPOffset) { 18198 // If we only pull from the overflow region, we don't create a branch. 18199 // We don't need to alter control flow. 18200 OffsetDestReg = 0; // unused 18201 OverflowDestReg = DestReg; 18202 18203 offsetMBB = nullptr; 18204 overflowMBB = thisMBB; 18205 endMBB = thisMBB; 18206 } else { 18207 // First emit code to check if gp_offset (or fp_offset) is below the bound. 18208 // If so, pull the argument from reg_save_area. (branch to offsetMBB) 18209 // If not, pull from overflow_area. (branch to overflowMBB) 18210 // 18211 // thisMBB 18212 // | . 18213 // | . 18214 // offsetMBB overflowMBB 18215 // | . 18216 // | . 18217 // endMBB 18218 18219 // Registers for the PHI in endMBB 18220 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass); 18221 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass); 18222 18223 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 18224 MachineFunction *MF = MBB->getParent(); 18225 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB); 18226 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB); 18227 endMBB = MF->CreateMachineBasicBlock(LLVM_BB); 18228 18229 MachineFunction::iterator MBBIter = MBB; 18230 ++MBBIter; 18231 18232 // Insert the new basic blocks 18233 MF->insert(MBBIter, offsetMBB); 18234 MF->insert(MBBIter, overflowMBB); 18235 MF->insert(MBBIter, endMBB); 18236 18237 // Transfer the remainder of MBB and its successor edges to endMBB. 18238 endMBB->splice(endMBB->begin(), thisMBB, 18239 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end()); 18240 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 18241 18242 // Make offsetMBB and overflowMBB successors of thisMBB 18243 thisMBB->addSuccessor(offsetMBB); 18244 thisMBB->addSuccessor(overflowMBB); 18245 18246 // endMBB is a successor of both offsetMBB and overflowMBB 18247 offsetMBB->addSuccessor(endMBB); 18248 overflowMBB->addSuccessor(endMBB); 18249 18250 // Load the offset value into a register 18251 OffsetReg = MRI.createVirtualRegister(OffsetRegClass); 18252 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg) 18253 .addOperand(Base) 18254 .addOperand(Scale) 18255 .addOperand(Index) 18256 .addDisp(Disp, UseFPOffset ? 4 : 0) 18257 .addOperand(Segment) 18258 .setMemRefs(MMOBegin, MMOEnd); 18259 18260 // Check if there is enough room left to pull this argument. 18261 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri)) 18262 .addReg(OffsetReg) 18263 .addImm(MaxOffset + 8 - ArgSizeA8); 18264 18265 // Branch to "overflowMBB" if offset >= max 18266 // Fall through to "offsetMBB" otherwise 18267 BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE))) 18268 .addMBB(overflowMBB); 18269 } 18270 18271 // In offsetMBB, emit code to use the reg_save_area. 18272 if (offsetMBB) { 18273 assert(OffsetReg != 0); 18274 18275 // Read the reg_save_area address. 
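    // (reg_save_area sits at byte offset 16 of the va_list, after gp_offset,
    //  fp_offset and overflow_area, hence the addDisp(Disp, 16) below; the
    //  load is essentially "movq 16(%va_list), %reg_save_area".)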
18276 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass); 18277 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg) 18278 .addOperand(Base) 18279 .addOperand(Scale) 18280 .addOperand(Index) 18281 .addDisp(Disp, 16) 18282 .addOperand(Segment) 18283 .setMemRefs(MMOBegin, MMOEnd); 18284 18285 // Zero-extend the offset 18286 unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); 18287 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64) 18288 .addImm(0) 18289 .addReg(OffsetReg) 18290 .addImm(X86::sub_32bit); 18291 18292 // Add the offset to the reg_save_area to get the final address. 18293 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg) 18294 .addReg(OffsetReg64) 18295 .addReg(RegSaveReg); 18296 18297 // Compute the offset for the next argument 18298 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass); 18299 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg) 18300 .addReg(OffsetReg) 18301 .addImm(UseFPOffset ? 16 : 8); 18302 18303 // Store it back into the va_list. 18304 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr)) 18305 .addOperand(Base) 18306 .addOperand(Scale) 18307 .addOperand(Index) 18308 .addDisp(Disp, UseFPOffset ? 4 : 0) 18309 .addOperand(Segment) 18310 .addReg(NextOffsetReg) 18311 .setMemRefs(MMOBegin, MMOEnd); 18312 18313 // Jump to endMBB 18314 BuildMI(offsetMBB, DL, TII->get(X86::JMP_4)) 18315 .addMBB(endMBB); 18316 } 18317 18318 // 18319 // Emit code to use overflow area 18320 // 18321 18322 // Load the overflow_area address into a register. 18323 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass); 18324 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg) 18325 .addOperand(Base) 18326 .addOperand(Scale) 18327 .addOperand(Index) 18328 .addDisp(Disp, 8) 18329 .addOperand(Segment) 18330 .setMemRefs(MMOBegin, MMOEnd); 18331 18332 // If we need to align it, do so. Otherwise, just copy the address 18333 // to OverflowDestReg. 18334 if (NeedsAlign) { 18335 // Align the overflow address 18336 assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2"); 18337 unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass); 18338 18339 // aligned_addr = (addr + (align-1)) & ~(align-1) 18340 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg) 18341 .addReg(OverflowAddrReg) 18342 .addImm(Align-1); 18343 18344 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg) 18345 .addReg(TmpReg) 18346 .addImm(~(uint64_t)(Align-1)); 18347 } else { 18348 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg) 18349 .addReg(OverflowAddrReg); 18350 } 18351 18352 // Compute the next overflow address after this argument. 18353 // (the overflow address should be kept 8-byte aligned) 18354 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass); 18355 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg) 18356 .addReg(OverflowDestReg) 18357 .addImm(ArgSizeA8); 18358 18359 // Store the new overflow address. 18360 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr)) 18361 .addOperand(Base) 18362 .addOperand(Scale) 18363 .addOperand(Index) 18364 .addDisp(Disp, 8) 18365 .addOperand(Segment) 18366 .addReg(NextAddrReg) 18367 .setMemRefs(MMOBegin, MMOEnd); 18368 18369 // If we branched, emit the PHI to the front of endMBB. 
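  // In MIR terms the join is simply (names illustrative):
  //   %DestReg = PHI %OffsetDestReg, <offsetMBB>, %OverflowDestReg, <overflowMBB>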
18370 if (offsetMBB) { 18371 BuildMI(*endMBB, endMBB->begin(), DL, 18372 TII->get(X86::PHI), DestReg) 18373 .addReg(OffsetDestReg).addMBB(offsetMBB) 18374 .addReg(OverflowDestReg).addMBB(overflowMBB); 18375 } 18376 18377 // Erase the pseudo instruction 18378 MI->eraseFromParent(); 18379 18380 return endMBB; 18381 } 18382 18383 MachineBasicBlock * 18384 X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( 18385 MachineInstr *MI, 18386 MachineBasicBlock *MBB) const { 18387 // Emit code to save XMM registers to the stack. The ABI says that the 18388 // number of registers to save is given in %al, so it's theoretically 18389 // possible to do an indirect jump trick to avoid saving all of them, 18390 // however this code takes a simpler approach and just executes all 18391 // of the stores if %al is non-zero. It's less code, and it's probably 18392 // easier on the hardware branch predictor, and stores aren't all that 18393 // expensive anyway. 18394 18395 // Create the new basic blocks. One block contains all the XMM stores, 18396 // and one block is the final destination regardless of whether any 18397 // stores were performed. 18398 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 18399 MachineFunction *F = MBB->getParent(); 18400 MachineFunction::iterator MBBIter = MBB; 18401 ++MBBIter; 18402 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); 18403 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); 18404 F->insert(MBBIter, XMMSaveMBB); 18405 F->insert(MBBIter, EndMBB); 18406 18407 // Transfer the remainder of MBB and its successor edges to EndMBB. 18408 EndMBB->splice(EndMBB->begin(), MBB, 18409 std::next(MachineBasicBlock::iterator(MI)), MBB->end()); 18410 EndMBB->transferSuccessorsAndUpdatePHIs(MBB); 18411 18412 // The original block will now fall through to the XMM save block. 18413 MBB->addSuccessor(XMMSaveMBB); 18414 // The XMMSaveMBB will fall through to the end block. 18415 XMMSaveMBB->addSuccessor(EndMBB); 18416 18417 // Now add the instructions. 18418 const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo(); 18419 DebugLoc DL = MI->getDebugLoc(); 18420 18421 unsigned CountReg = MI->getOperand(0).getReg(); 18422 int64_t RegSaveFrameIndex = MI->getOperand(1).getImm(); 18423 int64_t VarArgsFPOffset = MI->getOperand(2).getImm(); 18424 18425 if (!Subtarget->isTargetWin64()) { 18426 // If %al is 0, branch around the XMM save block. 18427 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); 18428 BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB); 18429 MBB->addSuccessor(EndMBB); 18430 } 18431 18432 // Make sure the last operand is EFLAGS, which gets clobbered by the branch 18433 // that was just emitted, but clearly shouldn't be "saved". 18434 assert((MI->getNumOperands() <= 3 || 18435 !MI->getOperand(MI->getNumOperands() - 1).isReg() || 18436 MI->getOperand(MI->getNumOperands() - 1).getReg() == X86::EFLAGS) 18437 && "Expected last argument to be EFLAGS"); 18438 unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr; 18439 // In the XMM save block, save all the XMM argument registers. 
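  // Each iteration below emits one 16-byte aligned spill of an XMM argument
  // register into the register save area, roughly
  //   movaps %xmm<N>, (RegSaveFrameIndex + VarArgsFPOffset + 16*N)
  // (vmovaps when AVX is available, per MOVOpc above).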
18440 for (int i = 3, e = MI->getNumOperands() - 1; i != e; ++i) { 18441 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; 18442 MachineMemOperand *MMO = 18443 F->getMachineMemOperand( 18444 MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset), 18445 MachineMemOperand::MOStore, 18446 /*Size=*/16, /*Align=*/16); 18447 BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc)) 18448 .addFrameIndex(RegSaveFrameIndex) 18449 .addImm(/*Scale=*/1) 18450 .addReg(/*IndexReg=*/0) 18451 .addImm(/*Disp=*/Offset) 18452 .addReg(/*Segment=*/0) 18453 .addReg(MI->getOperand(i).getReg()) 18454 .addMemOperand(MMO); 18455 } 18456 18457 MI->eraseFromParent(); // The pseudo instruction is gone now. 18458 18459 return EndMBB; 18460 } 18461 18462 // The EFLAGS operand of SelectItr might be missing a kill marker 18463 // because there were multiple uses of EFLAGS, and ISel didn't know 18464 // which to mark. Figure out whether SelectItr should have had a 18465 // kill marker, and set it if it should. Returns the correct kill 18466 // marker value. 18467 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, 18468 MachineBasicBlock* BB, 18469 const TargetRegisterInfo* TRI) { 18470 // Scan forward through BB for a use/def of EFLAGS. 18471 MachineBasicBlock::iterator miI(std::next(SelectItr)); 18472 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) { 18473 const MachineInstr& mi = *miI; 18474 if (mi.readsRegister(X86::EFLAGS)) 18475 return false; 18476 if (mi.definesRegister(X86::EFLAGS)) 18477 break; // Should have kill-flag - update below. 18478 } 18479 18480 // If we hit the end of the block, check whether EFLAGS is live into a 18481 // successor. 18482 if (miI == BB->end()) { 18483 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(), 18484 sEnd = BB->succ_end(); 18485 sItr != sEnd; ++sItr) { 18486 MachineBasicBlock* succ = *sItr; 18487 if (succ->isLiveIn(X86::EFLAGS)) 18488 return false; 18489 } 18490 } 18491 18492 // We found a def, or hit the end of the basic block and EFLAGS wasn't live 18493 // out. SelectMI should have a kill flag on EFLAGS. 18494 SelectItr->addRegisterKilled(X86::EFLAGS, TRI); 18495 return true; 18496 } 18497 18498 MachineBasicBlock * 18499 X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, 18500 MachineBasicBlock *BB) const { 18501 const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo(); 18502 DebugLoc DL = MI->getDebugLoc(); 18503 18504 // To "insert" a SELECT_CC instruction, we actually have to insert the 18505 // diamond control-flow pattern. The incoming instruction knows the 18506 // destination vreg to set, the condition code register to branch on, the 18507 // true/false values to select between, and a branch opcode to use. 18508 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 18509 MachineFunction::iterator It = BB; 18510 ++It; 18511 18512 // thisMBB: 18513 // ... 18514 // TrueVal = ... 18515 // cmpTY ccX, r1, r2 18516 // bCC copy1MBB 18517 // fallthrough --> copy0MBB 18518 MachineBasicBlock *thisMBB = BB; 18519 MachineFunction *F = BB->getParent(); 18520 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 18521 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 18522 F->insert(It, copy0MBB); 18523 F->insert(It, sinkMBB); 18524 18525 // If the EFLAGS register isn't dead in the terminator, then claim that it's 18526 // live into the sink and copy blocks. 
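  // (The conditional branch built below reads EFLAGS in this block; if the
  //  flags are also needed by instructions spliced into sinkMBB, the physreg
  //  must be recorded as live-in on both new blocks to keep the machine
  //  verifier and liveness updates happy.)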
18527 const TargetRegisterInfo *TRI = 18528 BB->getParent()->getSubtarget().getRegisterInfo(); 18529 if (!MI->killsRegister(X86::EFLAGS) && 18530 !checkAndUpdateEFLAGSKill(MI, BB, TRI)) { 18531 copy0MBB->addLiveIn(X86::EFLAGS); 18532 sinkMBB->addLiveIn(X86::EFLAGS); 18533 } 18534 18535 // Transfer the remainder of BB and its successor edges to sinkMBB. 18536 sinkMBB->splice(sinkMBB->begin(), BB, 18537 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 18538 sinkMBB->transferSuccessorsAndUpdatePHIs(BB); 18539 18540 // Add the true and fallthrough blocks as its successors. 18541 BB->addSuccessor(copy0MBB); 18542 BB->addSuccessor(sinkMBB); 18543 18544 // Create the conditional branch instruction. 18545 unsigned Opc = 18546 X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); 18547 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); 18548 18549 // copy0MBB: 18550 // %FalseValue = ... 18551 // # fallthrough to sinkMBB 18552 copy0MBB->addSuccessor(sinkMBB); 18553 18554 // sinkMBB: 18555 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 18556 // ... 18557 BuildMI(*sinkMBB, sinkMBB->begin(), DL, 18558 TII->get(X86::PHI), MI->getOperand(0).getReg()) 18559 .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) 18560 .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); 18561 18562 MI->eraseFromParent(); // The pseudo instruction is gone now. 18563 return sinkMBB; 18564 } 18565 18566 MachineBasicBlock * 18567 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, 18568 bool Is64Bit) const { 18569 MachineFunction *MF = BB->getParent(); 18570 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); 18571 DebugLoc DL = MI->getDebugLoc(); 18572 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 18573 18574 assert(MF->shouldSplitStack()); 18575 18576 unsigned TlsReg = Is64Bit ? X86::FS : X86::GS; 18577 unsigned TlsOffset = Is64Bit ? 0x70 : 0x30; 18578 18579 // BB: 18580 // ... [Till the alloca] 18581 // If stacklet is not large enough, jump to mallocMBB 18582 // 18583 // bumpMBB: 18584 // Allocate by subtracting from RSP 18585 // Jump to continueMBB 18586 // 18587 // mallocMBB: 18588 // Allocate by call to runtime 18589 // 18590 // continueMBB: 18591 // ... 18592 // [rest of original BB] 18593 // 18594 18595 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB); 18596 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB); 18597 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB); 18598 18599 MachineRegisterInfo &MRI = MF->getRegInfo(); 18600 const TargetRegisterClass *AddrRegClass = 18601 getRegClassFor(Is64Bit ? MVT::i64:MVT::i32); 18602 18603 unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass), 18604 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass), 18605 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass), 18606 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass), 18607 sizeVReg = MI->getOperand(1).getReg(), 18608 physSPReg = Is64Bit ? X86::RSP : X86::ESP; 18609 18610 MachineFunction::iterator MBBIter = BB; 18611 ++MBBIter; 18612 18613 MF->insert(MBBIter, bumpMBB); 18614 MF->insert(MBBIter, mallocMBB); 18615 MF->insert(MBBIter, continueMBB); 18616 18617 continueMBB->splice(continueMBB->begin(), BB, 18618 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 18619 continueMBB->transferSuccessorsAndUpdatePHIs(BB); 18620 18621 // Add code to the main basic block to check if the stack limit has been hit, 18622 // and if so, jump to mallocMBB otherwise to bumpMBB. 
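  // In the 64-bit case the check emitted below is roughly (virtual registers
  // shown as %vN):
  //   %v0 = COPY %rsp
  //   %v1 = SUB64rr %v0, %size        ; candidate new stack pointer
  //   CMP64mr %fs:0x70, %v1           ; stacklet limit vs. candidate SP
  //   JG mallocMBB                    ; limit above candidate SP -> grow stack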
18623 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg); 18624 BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg) 18625 .addReg(tmpSPVReg).addReg(sizeVReg); 18626 BuildMI(BB, DL, TII->get(Is64Bit ? X86::CMP64mr:X86::CMP32mr)) 18627 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg) 18628 .addReg(SPLimitVReg); 18629 BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB); 18630 18631 // bumpMBB simply decreases the stack pointer, since we know the current 18632 // stacklet has enough space. 18633 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg) 18634 .addReg(SPLimitVReg); 18635 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg) 18636 .addReg(SPLimitVReg); 18637 BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB); 18638 18639 // Calls into a routine in libgcc to allocate more space from the heap. 18640 const uint32_t *RegMask = MF->getTarget() 18641 .getSubtargetImpl() 18642 ->getRegisterInfo() 18643 ->getCallPreservedMask(CallingConv::C); 18644 if (Is64Bit) { 18645 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI) 18646 .addReg(sizeVReg); 18647 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) 18648 .addExternalSymbol("__morestack_allocate_stack_space") 18649 .addRegMask(RegMask) 18650 .addReg(X86::RDI, RegState::Implicit) 18651 .addReg(X86::RAX, RegState::ImplicitDefine); 18652 } else { 18653 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg) 18654 .addImm(12); 18655 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg); 18656 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32)) 18657 .addExternalSymbol("__morestack_allocate_stack_space") 18658 .addRegMask(RegMask) 18659 .addReg(X86::EAX, RegState::ImplicitDefine); 18660 } 18661 18662 if (!Is64Bit) 18663 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg) 18664 .addImm(16); 18665 18666 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg) 18667 .addReg(Is64Bit ? X86::RAX : X86::EAX); 18668 BuildMI(mallocMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB); 18669 18670 // Set up the CFG correctly. 18671 BB->addSuccessor(bumpMBB); 18672 BB->addSuccessor(mallocMBB); 18673 mallocMBB->addSuccessor(continueMBB); 18674 bumpMBB->addSuccessor(continueMBB); 18675 18676 // Take care of the PHI nodes. 18677 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI), 18678 MI->getOperand(0).getReg()) 18679 .addReg(mallocPtrVReg).addMBB(mallocMBB) 18680 .addReg(bumpSPPtrVReg).addMBB(bumpMBB); 18681 18682 // Delete the original pseudo instruction. 18683 MI->eraseFromParent(); 18684 18685 // And we're done. 18686 return continueMBB; 18687 } 18688 18689 MachineBasicBlock * 18690 X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, 18691 MachineBasicBlock *BB) const { 18692 const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo(); 18693 DebugLoc DL = MI->getDebugLoc(); 18694 18695 assert(!Subtarget->isTargetMacho()); 18696 18697 // The lowering is pretty easy: we're just emitting the call to _alloca. The 18698 // non-trivial part is impdef of ESP. 18699 18700 if (Subtarget->isTargetWin64()) { 18701 if (Subtarget->isTargetCygMing()) { 18702 // ___chkstk(Mingw64): 18703 // Clobbers R10, R11, RAX and EFLAGS. 18704 // Updates RSP. 
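      // i.e. the W64ALLOCA pseudo becomes a plain "call ___chkstk" with the
      // requested size in RAX; on this MinGW64 path the callee adjusts RSP
      // itself, whereas the MSVCRT __chkstk handled below leaves RSP alone and
      // an explicit SUB64rr RSP, RAX is emitted after the call.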
18705 BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA)) 18706 .addExternalSymbol("___chkstk") 18707 .addReg(X86::RAX, RegState::Implicit) 18708 .addReg(X86::RSP, RegState::Implicit) 18709 .addReg(X86::RAX, RegState::Define | RegState::Implicit) 18710 .addReg(X86::RSP, RegState::Define | RegState::Implicit) 18711 .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); 18712 } else { 18713 // __chkstk(MSVCRT): does not update stack pointer. 18714 // Clobbers R10, R11 and EFLAGS. 18715 BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA)) 18716 .addExternalSymbol("__chkstk") 18717 .addReg(X86::RAX, RegState::Implicit) 18718 .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); 18719 // RAX has the offset to be subtracted from RSP. 18720 BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP) 18721 .addReg(X86::RSP) 18722 .addReg(X86::RAX); 18723 } 18724 } else { 18725 const char *StackProbeSymbol = 18726 Subtarget->isTargetKnownWindowsMSVC() ? "_chkstk" : "_alloca"; 18727 18728 BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32)) 18729 .addExternalSymbol(StackProbeSymbol) 18730 .addReg(X86::EAX, RegState::Implicit) 18731 .addReg(X86::ESP, RegState::Implicit) 18732 .addReg(X86::EAX, RegState::Define | RegState::Implicit) 18733 .addReg(X86::ESP, RegState::Define | RegState::Implicit) 18734 .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); 18735 } 18736 18737 MI->eraseFromParent(); // The pseudo instruction is gone now. 18738 return BB; 18739 } 18740 18741 MachineBasicBlock * 18742 X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, 18743 MachineBasicBlock *BB) const { 18744 // This is pretty easy. We're taking the value that we received from 18745 // our load from the relocation, sticking it in either RDI (x86-64) 18746 // or EAX and doing an indirect call. The return value will then 18747 // be in the normal return register. 18748 MachineFunction *F = BB->getParent(); 18749 const X86InstrInfo *TII = 18750 static_cast<const X86InstrInfo *>(F->getSubtarget().getInstrInfo()); 18751 DebugLoc DL = MI->getDebugLoc(); 18752 18753 assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?"); 18754 assert(MI->getOperand(3).isGlobal() && "This should be a global"); 18755 18756 // Get a register mask for the lowered call. 18757 // FIXME: The 32-bit calls have non-standard calling conventions. Use a 18758 // proper register mask. 
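//
// [Editor's illustrative aside -- not part of the upstream file.] The 64-bit
// case built below amounts to the usual Darwin thread-local descriptor call
// (the @TLVP relocation spelling is an assumption about the assembly form):
//
//   movq  _var@TLVP(%rip), %rdi   ; the MOV64rm of operand 3
//   callq *(%rdi)                 ; the CALL64m through RDI
//   ; the variable's address is returned in RAX
//
// The 32-bit paths do the same through EAX, with or without the PIC base
// register depending on the relocation model.
//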
18759 const uint32_t *RegMask = F->getTarget() 18760 .getSubtargetImpl() 18761 ->getRegisterInfo() 18762 ->getCallPreservedMask(CallingConv::C); 18763 if (Subtarget->is64Bit()) { 18764 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 18765 TII->get(X86::MOV64rm), X86::RDI) 18766 .addReg(X86::RIP) 18767 .addImm(0).addReg(0) 18768 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 18769 MI->getOperand(3).getTargetFlags()) 18770 .addReg(0); 18771 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m)); 18772 addDirectMem(MIB, X86::RDI); 18773 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask); 18774 } else if (F->getTarget().getRelocationModel() != Reloc::PIC_) { 18775 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 18776 TII->get(X86::MOV32rm), X86::EAX) 18777 .addReg(0) 18778 .addImm(0).addReg(0) 18779 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 18780 MI->getOperand(3).getTargetFlags()) 18781 .addReg(0); 18782 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 18783 addDirectMem(MIB, X86::EAX); 18784 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); 18785 } else { 18786 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 18787 TII->get(X86::MOV32rm), X86::EAX) 18788 .addReg(TII->getGlobalBaseReg(F)) 18789 .addImm(0).addReg(0) 18790 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 18791 MI->getOperand(3).getTargetFlags()) 18792 .addReg(0); 18793 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 18794 addDirectMem(MIB, X86::EAX); 18795 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); 18796 } 18797 18798 MI->eraseFromParent(); // The pseudo instruction is gone now. 18799 return BB; 18800 } 18801 18802 MachineBasicBlock * 18803 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, 18804 MachineBasicBlock *MBB) const { 18805 DebugLoc DL = MI->getDebugLoc(); 18806 MachineFunction *MF = MBB->getParent(); 18807 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); 18808 MachineRegisterInfo &MRI = MF->getRegInfo(); 18809 18810 const BasicBlock *BB = MBB->getBasicBlock(); 18811 MachineFunction::iterator I = MBB; 18812 ++I; 18813 18814 // Memory Reference 18815 MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); 18816 MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); 18817 18818 unsigned DstReg; 18819 unsigned MemOpndSlot = 0; 18820 18821 unsigned CurOp = 0; 18822 18823 DstReg = MI->getOperand(CurOp++).getReg(); 18824 const TargetRegisterClass *RC = MRI.getRegClass(DstReg); 18825 assert(RC->hasType(MVT::i32) && "Invalid destination!"); 18826 unsigned mainDstReg = MRI.createVirtualRegister(RC); 18827 unsigned restoreDstReg = MRI.createVirtualRegister(RC); 18828 18829 MemOpndSlot = CurOp; 18830 18831 MVT PVT = getPointerTy(); 18832 assert((PVT == MVT::i64 || PVT == MVT::i32) && 18833 "Invalid Pointer Size!"); 18834 18835 // For v = setjmp(buf), we generate 18836 // 18837 // thisMBB: 18838 // buf[LabelOffset] = restoreMBB 18839 // SjLjSetup restoreMBB 18840 // 18841 // mainMBB: 18842 // v_main = 0 18843 // 18844 // sinkMBB: 18845 // v = phi(main, restore) 18846 // 18847 // restoreMBB: 18848 // v_restore = 1 18849 18850 MachineBasicBlock *thisMBB = MBB; 18851 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); 18852 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); 18853 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB); 18854 MF->insert(I, mainMBB); 18855 MF->insert(I, sinkMBB); 18856 MF->push_back(restoreMBB); 18857 18858 MachineInstrBuilder MIB; 18859 18860 // Transfer the remainder 
of BB and its successor edges to sinkMBB. 18861 sinkMBB->splice(sinkMBB->begin(), MBB, 18862 std::next(MachineBasicBlock::iterator(MI)), MBB->end()); 18863 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); 18864 18865 // thisMBB: 18866 unsigned PtrStoreOpc = 0; 18867 unsigned LabelReg = 0; 18868 const int64_t LabelOffset = 1 * PVT.getStoreSize(); 18869 Reloc::Model RM = MF->getTarget().getRelocationModel(); 18870 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) && 18871 (RM == Reloc::Static || RM == Reloc::DynamicNoPIC); 18872 18873 // Prepare IP either in reg or imm. 18874 if (!UseImmLabel) { 18875 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr; 18876 const TargetRegisterClass *PtrRC = getRegClassFor(PVT); 18877 LabelReg = MRI.createVirtualRegister(PtrRC); 18878 if (Subtarget->is64Bit()) { 18879 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg) 18880 .addReg(X86::RIP) 18881 .addImm(0) 18882 .addReg(0) 18883 .addMBB(restoreMBB) 18884 .addReg(0); 18885 } else { 18886 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII); 18887 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg) 18888 .addReg(XII->getGlobalBaseReg(MF)) 18889 .addImm(0) 18890 .addReg(0) 18891 .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference()) 18892 .addReg(0); 18893 } 18894 } else 18895 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi; 18896 // Store IP 18897 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc)); 18898 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { 18899 if (i == X86::AddrDisp) 18900 MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset); 18901 else 18902 MIB.addOperand(MI->getOperand(MemOpndSlot + i)); 18903 } 18904 if (!UseImmLabel) 18905 MIB.addReg(LabelReg); 18906 else 18907 MIB.addMBB(restoreMBB); 18908 MIB.setMemRefs(MMOBegin, MMOEnd); 18909 // Setup 18910 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup)) 18911 .addMBB(restoreMBB); 18912 18913 const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( 18914 MF->getSubtarget().getRegisterInfo()); 18915 MIB.addRegMask(RegInfo->getNoPreservedMask()); 18916 thisMBB->addSuccessor(mainMBB); 18917 thisMBB->addSuccessor(restoreMBB); 18918 18919 // mainMBB: 18920 // EAX = 0 18921 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg); 18922 mainMBB->addSuccessor(sinkMBB); 18923 18924 // sinkMBB: 18925 BuildMI(*sinkMBB, sinkMBB->begin(), DL, 18926 TII->get(X86::PHI), DstReg) 18927 .addReg(mainDstReg).addMBB(mainMBB) 18928 .addReg(restoreDstReg).addMBB(restoreMBB); 18929 18930 // restoreMBB: 18931 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1); 18932 BuildMI(restoreMBB, DL, TII->get(X86::JMP_4)).addMBB(sinkMBB); 18933 restoreMBB->addSuccessor(sinkMBB); 18934 18935 MI->eraseFromParent(); 18936 return sinkMBB; 18937 } 18938 18939 MachineBasicBlock * 18940 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, 18941 MachineBasicBlock *MBB) const { 18942 DebugLoc DL = MI->getDebugLoc(); 18943 MachineFunction *MF = MBB->getParent(); 18944 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); 18945 MachineRegisterInfo &MRI = MF->getRegInfo(); 18946 18947 // Memory Reference 18948 MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); 18949 MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); 18950 18951 MVT PVT = getPointerTy(); 18952 assert((PVT == MVT::i64 || PVT == MVT::i32) && 18953 "Invalid Pointer Size!"); 18954 18955 const TargetRegisterClass *RC = 18956 (PVT == MVT::i64) ? 
&X86::GR64RegClass : &X86::GR32RegClass; 18957 unsigned Tmp = MRI.createVirtualRegister(RC); 18958 // Since FP is only updated here but NOT referenced, it's treated as GPR. 18959 const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( 18960 MF->getSubtarget().getRegisterInfo()); 18961 unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP; 18962 unsigned SP = RegInfo->getStackRegister(); 18963 18964 MachineInstrBuilder MIB; 18965 18966 const int64_t LabelOffset = 1 * PVT.getStoreSize(); 18967 const int64_t SPOffset = 2 * PVT.getStoreSize(); 18968 18969 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm; 18970 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r; 18971 18972 // Reload FP 18973 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP); 18974 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) 18975 MIB.addOperand(MI->getOperand(i)); 18976 MIB.setMemRefs(MMOBegin, MMOEnd); 18977 // Reload IP 18978 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp); 18979 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { 18980 if (i == X86::AddrDisp) 18981 MIB.addDisp(MI->getOperand(i), LabelOffset); 18982 else 18983 MIB.addOperand(MI->getOperand(i)); 18984 } 18985 MIB.setMemRefs(MMOBegin, MMOEnd); 18986 // Reload SP 18987 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP); 18988 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { 18989 if (i == X86::AddrDisp) 18990 MIB.addDisp(MI->getOperand(i), SPOffset); 18991 else 18992 MIB.addOperand(MI->getOperand(i)); 18993 } 18994 MIB.setMemRefs(MMOBegin, MMOEnd); 18995 // Jump 18996 BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp); 18997 18998 MI->eraseFromParent(); 18999 return MBB; 19000 } 19001 19002 // Replace 213-type (isel default) FMA3 instructions with 231-type for 19003 // accumulator loops. Writing back to the accumulator allows the coalescer 19004 // to remove extra copies in the loop. 19005 MachineBasicBlock * 19006 X86TargetLowering::emitFMA3Instr(MachineInstr *MI, 19007 MachineBasicBlock *MBB) const { 19008 MachineOperand &AddendOp = MI->getOperand(3); 19009 19010 // Bail out early if the addend isn't a register - we can't switch these. 19011 if (!AddendOp.isReg()) 19012 return MBB; 19013 19014 MachineFunction &MF = *MBB->getParent(); 19015 MachineRegisterInfo &MRI = MF.getRegInfo(); 19016 19017 // Check whether the addend is defined by a PHI: 19018 assert(MRI.hasOneDef(AddendOp.getReg()) && "Multiple defs in SSA?"); 19019 MachineInstr &AddendDef = *MRI.def_instr_begin(AddendOp.getReg()); 19020 if (!AddendDef.isPHI()) 19021 return MBB; 19022 19023 // Look for the following pattern: 19024 // loop: 19025 // %addend = phi [%entry, 0], [%loop, %result] 19026 // ... 19027 // %result<tied1> = FMA213 %m2<tied0>, %m1, %addend 19028 19029 // Replace with: 19030 // loop: 19031 // %addend = phi [%entry, 0], [%loop, %result] 19032 // ... 19033 // %result<tied1> = FMA231 %addend<tied0>, %m1, %m2 19034 19035 for (unsigned i = 1, e = AddendDef.getNumOperands(); i < e; i += 2) { 19036 assert(AddendDef.getOperand(i).isReg()); 19037 MachineOperand PHISrcOp = AddendDef.getOperand(i); 19038 MachineInstr &PHISrcInst = *MRI.def_instr_begin(PHISrcOp.getReg()); 19039 if (&PHISrcInst == MI) { 19040 // Found a matching instruction. 
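//
// [Editor's illustrative aside -- not part of the upstream file.] Why the 231
// form helps, using the usual Intel FMA3 operand-order convention (stated
// here as an assumption): the 213 form computes
//   result = m2 * m1 + addend    with the result tied to the m2 operand,
// while the rewritten 231 form computes
//   result = m1 * m2 + addend    with the result tied to the addend operand.
// The value is identical, but in an accumulator loop (acc = a * b + acc) the
// 231 form ties the destination to the accumulator PHI, so the coalescer can
// keep the accumulator in one register instead of copying it every iteration.
//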
19041 unsigned NewFMAOpc = 0; 19042 switch (MI->getOpcode()) { 19043 case X86::VFMADDPDr213r: NewFMAOpc = X86::VFMADDPDr231r; break; 19044 case X86::VFMADDPSr213r: NewFMAOpc = X86::VFMADDPSr231r; break; 19045 case X86::VFMADDSDr213r: NewFMAOpc = X86::VFMADDSDr231r; break; 19046 case X86::VFMADDSSr213r: NewFMAOpc = X86::VFMADDSSr231r; break; 19047 case X86::VFMSUBPDr213r: NewFMAOpc = X86::VFMSUBPDr231r; break; 19048 case X86::VFMSUBPSr213r: NewFMAOpc = X86::VFMSUBPSr231r; break; 19049 case X86::VFMSUBSDr213r: NewFMAOpc = X86::VFMSUBSDr231r; break; 19050 case X86::VFMSUBSSr213r: NewFMAOpc = X86::VFMSUBSSr231r; break; 19051 case X86::VFNMADDPDr213r: NewFMAOpc = X86::VFNMADDPDr231r; break; 19052 case X86::VFNMADDPSr213r: NewFMAOpc = X86::VFNMADDPSr231r; break; 19053 case X86::VFNMADDSDr213r: NewFMAOpc = X86::VFNMADDSDr231r; break; 19054 case X86::VFNMADDSSr213r: NewFMAOpc = X86::VFNMADDSSr231r; break; 19055 case X86::VFNMSUBPDr213r: NewFMAOpc = X86::VFNMSUBPDr231r; break; 19056 case X86::VFNMSUBPSr213r: NewFMAOpc = X86::VFNMSUBPSr231r; break; 19057 case X86::VFNMSUBSDr213r: NewFMAOpc = X86::VFNMSUBSDr231r; break; 19058 case X86::VFNMSUBSSr213r: NewFMAOpc = X86::VFNMSUBSSr231r; break; 19059 case X86::VFMADDPDr213rY: NewFMAOpc = X86::VFMADDPDr231rY; break; 19060 case X86::VFMADDPSr213rY: NewFMAOpc = X86::VFMADDPSr231rY; break; 19061 case X86::VFMSUBPDr213rY: NewFMAOpc = X86::VFMSUBPDr231rY; break; 19062 case X86::VFMSUBPSr213rY: NewFMAOpc = X86::VFMSUBPSr231rY; break; 19063 case X86::VFNMADDPDr213rY: NewFMAOpc = X86::VFNMADDPDr231rY; break; 19064 case X86::VFNMADDPSr213rY: NewFMAOpc = X86::VFNMADDPSr231rY; break; 19065 case X86::VFNMSUBPDr213rY: NewFMAOpc = X86::VFNMSUBPDr231rY; break; 19066 case X86::VFNMSUBPSr213rY: NewFMAOpc = X86::VFNMSUBPSr231rY; break; 19067 default: llvm_unreachable("Unrecognized FMA variant."); 19068 } 19069 19070 const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); 19071 MachineInstrBuilder MIB = 19072 BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc)) 19073 .addOperand(MI->getOperand(0)) 19074 .addOperand(MI->getOperand(3)) 19075 .addOperand(MI->getOperand(2)) 19076 .addOperand(MI->getOperand(1)); 19077 MBB->insert(MachineBasicBlock::iterator(MI), MIB); 19078 MI->eraseFromParent(); 19079 } 19080 } 19081 19082 return MBB; 19083 } 19084 19085 MachineBasicBlock * 19086 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, 19087 MachineBasicBlock *BB) const { 19088 switch (MI->getOpcode()) { 19089 default: llvm_unreachable("Unexpected instr type to insert"); 19090 case X86::TAILJMPd64: 19091 case X86::TAILJMPr64: 19092 case X86::TAILJMPm64: 19093 llvm_unreachable("TAILJMP64 would not be touched here."); 19094 case X86::TCRETURNdi64: 19095 case X86::TCRETURNri64: 19096 case X86::TCRETURNmi64: 19097 return BB; 19098 case X86::WIN_ALLOCA: 19099 return EmitLoweredWinAlloca(MI, BB); 19100 case X86::SEG_ALLOCA_32: 19101 return EmitLoweredSegAlloca(MI, BB, false); 19102 case X86::SEG_ALLOCA_64: 19103 return EmitLoweredSegAlloca(MI, BB, true); 19104 case X86::TLSCall_32: 19105 case X86::TLSCall_64: 19106 return EmitLoweredTLSCall(MI, BB); 19107 case X86::CMOV_GR8: 19108 case X86::CMOV_FR32: 19109 case X86::CMOV_FR64: 19110 case X86::CMOV_V4F32: 19111 case X86::CMOV_V2F64: 19112 case X86::CMOV_V2I64: 19113 case X86::CMOV_V8F32: 19114 case X86::CMOV_V4F64: 19115 case X86::CMOV_V4I64: 19116 case X86::CMOV_V16F32: 19117 case X86::CMOV_V8F64: 19118 case X86::CMOV_V8I64: 19119 case X86::CMOV_GR16: 19120 case X86::CMOV_GR32: 19121 case X86::CMOV_RFP32: 19122 case 
X86::CMOV_RFP64: 19123 case X86::CMOV_RFP80: 19124 return EmitLoweredSelect(MI, BB); 19125 19126 case X86::FP32_TO_INT16_IN_MEM: 19127 case X86::FP32_TO_INT32_IN_MEM: 19128 case X86::FP32_TO_INT64_IN_MEM: 19129 case X86::FP64_TO_INT16_IN_MEM: 19130 case X86::FP64_TO_INT32_IN_MEM: 19131 case X86::FP64_TO_INT64_IN_MEM: 19132 case X86::FP80_TO_INT16_IN_MEM: 19133 case X86::FP80_TO_INT32_IN_MEM: 19134 case X86::FP80_TO_INT64_IN_MEM: { 19135 MachineFunction *F = BB->getParent(); 19136 const TargetInstrInfo *TII = F->getSubtarget().getInstrInfo(); 19137 DebugLoc DL = MI->getDebugLoc(); 19138 19139 // Change the floating point control register to use "round towards zero" 19140 // mode when truncating to an integer value. 19141 int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false); 19142 addFrameReference(BuildMI(*BB, MI, DL, 19143 TII->get(X86::FNSTCW16m)), CWFrameIdx); 19144 19145 // Load the old value of the high byte of the control word... 19146 unsigned OldCW = 19147 F->getRegInfo().createVirtualRegister(&X86::GR16RegClass); 19148 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW), 19149 CWFrameIdx); 19150 19151 // Set the high part to be round to zero... 19152 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx) 19153 .addImm(0xC7F); 19154 19155 // Reload the modified control word now... 19156 addFrameReference(BuildMI(*BB, MI, DL, 19157 TII->get(X86::FLDCW16m)), CWFrameIdx); 19158 19159 // Restore the memory image of control word to original value 19160 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx) 19161 .addReg(OldCW); 19162 19163 // Get the X86 opcode to use. 19164 unsigned Opc; 19165 switch (MI->getOpcode()) { 19166 default: llvm_unreachable("illegal opcode!"); 19167 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; 19168 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; 19169 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; 19170 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; 19171 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; 19172 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; 19173 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; 19174 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; 19175 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; 19176 } 19177 19178 X86AddressMode AM; 19179 MachineOperand &Op = MI->getOperand(0); 19180 if (Op.isReg()) { 19181 AM.BaseType = X86AddressMode::RegBase; 19182 AM.Base.Reg = Op.getReg(); 19183 } else { 19184 AM.BaseType = X86AddressMode::FrameIndexBase; 19185 AM.Base.FrameIndex = Op.getIndex(); 19186 } 19187 Op = MI->getOperand(1); 19188 if (Op.isImm()) 19189 AM.Scale = Op.getImm(); 19190 Op = MI->getOperand(2); 19191 if (Op.isImm()) 19192 AM.IndexReg = Op.getImm(); 19193 Op = MI->getOperand(3); 19194 if (Op.isGlobal()) { 19195 AM.GV = Op.getGlobal(); 19196 } else { 19197 AM.Disp = Op.getImm(); 19198 } 19199 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM) 19200 .addReg(MI->getOperand(X86::AddrNumOperands).getReg()); 19201 19202 // Reload the original control word now. 19203 addFrameReference(BuildMI(*BB, MI, DL, 19204 TII->get(X86::FLDCW16m)), CWFrameIdx); 19205 19206 MI->eraseFromParent(); // The pseudo instruction is gone now. 19207 return BB; 19208 } 19209 // String/text processing lowering. 
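//
// [Editor's illustrative aside -- not part of the upstream file.] In the
// FP*_TO_INT*_IN_MEM block above, the constant 0xC7F written into the x87
// control word sets the rounding-control field (bits 10-11) to 11b, i.e.
// round toward zero for the FIST store, and keeps the exception-mask bits
// set; the original control word is saved beforehand and restored afterwards.
// (Decoding of the remaining control-word fields is left as an assumption.)
//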
19210 case X86::PCMPISTRM128REG: 19211 case X86::VPCMPISTRM128REG: 19212 case X86::PCMPISTRM128MEM: 19213 case X86::VPCMPISTRM128MEM: 19214 case X86::PCMPESTRM128REG: 19215 case X86::VPCMPESTRM128REG: 19216 case X86::PCMPESTRM128MEM: 19217 case X86::VPCMPESTRM128MEM: 19218 assert(Subtarget->hasSSE42() && 19219 "Target must have SSE4.2 or AVX features enabled"); 19220 return EmitPCMPSTRM(MI, BB, BB->getParent()->getSubtarget().getInstrInfo()); 19221 19222 // String/text processing lowering. 19223 case X86::PCMPISTRIREG: 19224 case X86::VPCMPISTRIREG: 19225 case X86::PCMPISTRIMEM: 19226 case X86::VPCMPISTRIMEM: 19227 case X86::PCMPESTRIREG: 19228 case X86::VPCMPESTRIREG: 19229 case X86::PCMPESTRIMEM: 19230 case X86::VPCMPESTRIMEM: 19231 assert(Subtarget->hasSSE42() && 19232 "Target must have SSE4.2 or AVX features enabled"); 19233 return EmitPCMPSTRI(MI, BB, BB->getParent()->getSubtarget().getInstrInfo()); 19234 19235 // Thread synchronization. 19236 case X86::MONITOR: 19237 return EmitMonitor(MI, BB, BB->getParent()->getSubtarget().getInstrInfo(), 19238 Subtarget); 19239 19240 // xbegin 19241 case X86::XBEGIN: 19242 return EmitXBegin(MI, BB, BB->getParent()->getSubtarget().getInstrInfo()); 19243 19244 case X86::VASTART_SAVE_XMM_REGS: 19245 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); 19246 19247 case X86::VAARG_64: 19248 return EmitVAARG64WithCustomInserter(MI, BB); 19249 19250 case X86::EH_SjLj_SetJmp32: 19251 case X86::EH_SjLj_SetJmp64: 19252 return emitEHSjLjSetJmp(MI, BB); 19253 19254 case X86::EH_SjLj_LongJmp32: 19255 case X86::EH_SjLj_LongJmp64: 19256 return emitEHSjLjLongJmp(MI, BB); 19257 19258 case TargetOpcode::STACKMAP: 19259 case TargetOpcode::PATCHPOINT: 19260 return emitPatchPoint(MI, BB); 19261 19262 case X86::VFMADDPDr213r: 19263 case X86::VFMADDPSr213r: 19264 case X86::VFMADDSDr213r: 19265 case X86::VFMADDSSr213r: 19266 case X86::VFMSUBPDr213r: 19267 case X86::VFMSUBPSr213r: 19268 case X86::VFMSUBSDr213r: 19269 case X86::VFMSUBSSr213r: 19270 case X86::VFNMADDPDr213r: 19271 case X86::VFNMADDPSr213r: 19272 case X86::VFNMADDSDr213r: 19273 case X86::VFNMADDSSr213r: 19274 case X86::VFNMSUBPDr213r: 19275 case X86::VFNMSUBPSr213r: 19276 case X86::VFNMSUBSDr213r: 19277 case X86::VFNMSUBSSr213r: 19278 case X86::VFMADDPDr213rY: 19279 case X86::VFMADDPSr213rY: 19280 case X86::VFMSUBPDr213rY: 19281 case X86::VFMSUBPSr213rY: 19282 case X86::VFNMADDPDr213rY: 19283 case X86::VFNMADDPSr213rY: 19284 case X86::VFNMSUBPDr213rY: 19285 case X86::VFNMSUBPSr213rY: 19286 return emitFMA3Instr(MI, BB); 19287 } 19288 } 19289 19290 //===----------------------------------------------------------------------===// 19291 // X86 Optimization Hooks 19292 //===----------------------------------------------------------------------===// 19293 19294 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, 19295 APInt &KnownZero, 19296 APInt &KnownOne, 19297 const SelectionDAG &DAG, 19298 unsigned Depth) const { 19299 unsigned BitWidth = KnownZero.getBitWidth(); 19300 unsigned Opc = Op.getOpcode(); 19301 assert((Opc >= ISD::BUILTIN_OP_END || 19302 Opc == ISD::INTRINSIC_WO_CHAIN || 19303 Opc == ISD::INTRINSIC_W_CHAIN || 19304 Opc == ISD::INTRINSIC_VOID) && 19305 "Should use MaskedValueIsZero if you don't know whether Op" 19306 " is a target node!"); 19307 19308 KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything. 
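//
// [Editor's illustrative aside -- not part of the upstream file.] Example of
// what the switch below records: movmskps on a v4f32 produces only a 4-bit
// result, so for a 32-bit node the combiner learns
//   KnownZero = APInt::getHighBitsSet(32, 28);   // bits [31:4] are zero
// and the SETCC-style cases similarly report that everything above bit 0 is
// known zero, since those nodes only ever produce 0 or 1.
//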
19309 switch (Opc) { 19310 default: break; 19311 case X86ISD::ADD: 19312 case X86ISD::SUB: 19313 case X86ISD::ADC: 19314 case X86ISD::SBB: 19315 case X86ISD::SMUL: 19316 case X86ISD::UMUL: 19317 case X86ISD::INC: 19318 case X86ISD::DEC: 19319 case X86ISD::OR: 19320 case X86ISD::XOR: 19321 case X86ISD::AND: 19322 // These nodes' second result is a boolean. 19323 if (Op.getResNo() == 0) 19324 break; 19325 // Fallthrough 19326 case X86ISD::SETCC: 19327 KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); 19328 break; 19329 case ISD::INTRINSIC_WO_CHAIN: { 19330 unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 19331 unsigned NumLoBits = 0; 19332 switch (IntId) { 19333 default: break; 19334 case Intrinsic::x86_sse_movmsk_ps: 19335 case Intrinsic::x86_avx_movmsk_ps_256: 19336 case Intrinsic::x86_sse2_movmsk_pd: 19337 case Intrinsic::x86_avx_movmsk_pd_256: 19338 case Intrinsic::x86_mmx_pmovmskb: 19339 case Intrinsic::x86_sse2_pmovmskb_128: 19340 case Intrinsic::x86_avx2_pmovmskb: { 19341 // High bits of movmskp{s|d}, pmovmskb are known zero. 19342 switch (IntId) { 19343 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 19344 case Intrinsic::x86_sse_movmsk_ps: NumLoBits = 4; break; 19345 case Intrinsic::x86_avx_movmsk_ps_256: NumLoBits = 8; break; 19346 case Intrinsic::x86_sse2_movmsk_pd: NumLoBits = 2; break; 19347 case Intrinsic::x86_avx_movmsk_pd_256: NumLoBits = 4; break; 19348 case Intrinsic::x86_mmx_pmovmskb: NumLoBits = 8; break; 19349 case Intrinsic::x86_sse2_pmovmskb_128: NumLoBits = 16; break; 19350 case Intrinsic::x86_avx2_pmovmskb: NumLoBits = 32; break; 19351 } 19352 KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits); 19353 break; 19354 } 19355 } 19356 break; 19357 } 19358 } 19359 } 19360 19361 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode( 19362 SDValue Op, 19363 const SelectionDAG &, 19364 unsigned Depth) const { 19365 // SETCC_CARRY sets the dest to ~0 for true or 0 for false. 19366 if (Op.getOpcode() == X86ISD::SETCC_CARRY) 19367 return Op.getValueType().getScalarType().getSizeInBits(); 19368 19369 // Fallback case. 19370 return 1; 19371 } 19372 19373 /// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the 19374 /// node is a GlobalAddress + offset. 
19375 bool X86TargetLowering::isGAPlusOffset(SDNode *N, 19376 const GlobalValue* &GA, 19377 int64_t &Offset) const { 19378 if (N->getOpcode() == X86ISD::Wrapper) { 19379 if (isa<GlobalAddressSDNode>(N->getOperand(0))) { 19380 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal(); 19381 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset(); 19382 return true; 19383 } 19384 } 19385 return TargetLowering::isGAPlusOffset(N, GA, Offset); 19386 } 19387 19388 /// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the 19389 /// same as extracting the high 128-bit part of 256-bit vector and then 19390 /// inserting the result into the low part of a new 256-bit vector 19391 static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) { 19392 EVT VT = SVOp->getValueType(0); 19393 unsigned NumElems = VT.getVectorNumElements(); 19394 19395 // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> 19396 for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j) 19397 if (!isUndefOrEqual(SVOp->getMaskElt(i), j) || 19398 SVOp->getMaskElt(j) >= 0) 19399 return false; 19400 19401 return true; 19402 } 19403 19404 /// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the 19405 /// same as extracting the low 128-bit part of 256-bit vector and then 19406 /// inserting the result into the high part of a new 256-bit vector 19407 static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) { 19408 EVT VT = SVOp->getValueType(0); 19409 unsigned NumElems = VT.getVectorNumElements(); 19410 19411 // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1> 19412 for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j) 19413 if (!isUndefOrEqual(SVOp->getMaskElt(i), j) || 19414 SVOp->getMaskElt(j) >= 0) 19415 return false; 19416 19417 return true; 19418 } 19419 19420 /// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors. 19421 static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, 19422 TargetLowering::DAGCombinerInfo &DCI, 19423 const X86Subtarget* Subtarget) { 19424 SDLoc dl(N); 19425 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 19426 SDValue V1 = SVOp->getOperand(0); 19427 SDValue V2 = SVOp->getOperand(1); 19428 EVT VT = SVOp->getValueType(0); 19429 unsigned NumElems = VT.getVectorNumElements(); 19430 19431 if (V1.getOpcode() == ISD::CONCAT_VECTORS && 19432 V2.getOpcode() == ISD::CONCAT_VECTORS) { 19433 // 19434 // 0,0,0,... 19435 // | 19436 // V UNDEF BUILD_VECTOR UNDEF 19437 // \ / \ / 19438 // CONCAT_VECTOR CONCAT_VECTOR 19439 // \ / 19440 // \ / 19441 // RESULT: V + zero extended 19442 // 19443 if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR || 19444 V2.getOperand(1).getOpcode() != ISD::UNDEF || 19445 V1.getOperand(1).getOpcode() != ISD::UNDEF) 19446 return SDValue(); 19447 19448 if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode())) 19449 return SDValue(); 19450 19451 // To match the shuffle mask, the first half of the mask should 19452 // be exactly the first vector, and all the rest a splat with the 19453 // first element of the second one. 19454 for (unsigned i = 0; i != NumElems/2; ++i) 19455 if (!isUndefOrEqual(SVOp->getMaskElt(i), i) || 19456 !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems)) 19457 return SDValue(); 19458 19459 // If V1 is coming from a vector load then just fold to a VZEXT_LOAD. 
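//
// [Editor's illustrative aside -- not part of the upstream file.] A concrete
// instance of the mask test above, for a v8i32 shuffle: with
//   V1 = concat_vectors(X, undef)  and  V2 = concat_vectors(<0,0,0,0>, undef)
// the mask <0,1,2,3, 8,8,8,8> takes X's four elements for the low half and
// element 0 of V2 (a zero) for every element of the high half, so the whole
// shuffle is simply "X zero-extended to 256 bits". The code below folds that
// either to a VZEXT_LOAD (when X comes straight from a single-use load) or to
// an insertion of X into a zero vector.
//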
19460 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) { 19461 if (Ld->hasNUsesOfValue(1, 0)) { 19462 SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other); 19463 SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() }; 19464 SDValue ResNode = 19465 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 19466 Ld->getMemoryVT(), 19467 Ld->getPointerInfo(), 19468 Ld->getAlignment(), 19469 false/*isVolatile*/, true/*ReadMem*/, 19470 false/*WriteMem*/); 19471 19472 // Make sure the newly-created LOAD is in the same position as Ld in 19473 // terms of dependency. We create a TokenFactor for Ld and ResNode, 19474 // and update uses of Ld's output chain to use the TokenFactor. 19475 if (Ld->hasAnyUseOfValue(1)) { 19476 SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 19477 SDValue(Ld, 1), SDValue(ResNode.getNode(), 1)); 19478 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain); 19479 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1), 19480 SDValue(ResNode.getNode(), 1)); 19481 } 19482 19483 return DAG.getNode(ISD::BITCAST, dl, VT, ResNode); 19484 } 19485 } 19486 19487 // Emit a zeroed vector and insert the desired subvector on its 19488 // first half. 19489 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); 19490 SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl); 19491 return DCI.CombineTo(N, InsV); 19492 } 19493 19494 //===--------------------------------------------------------------------===// 19495 // Combine some shuffles into subvector extracts and inserts: 19496 // 19497 19498 // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> 19499 if (isShuffleHigh128VectorInsertLow(SVOp)) { 19500 SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl); 19501 SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl); 19502 return DCI.CombineTo(N, InsV); 19503 } 19504 19505 // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1> 19506 if (isShuffleLow128VectorInsertHigh(SVOp)) { 19507 SDValue V = Extract128BitVector(V1, 0, DAG, dl); 19508 SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl); 19509 return DCI.CombineTo(N, InsV); 19510 } 19511 19512 return SDValue(); 19513 } 19514 19515 /// \brief Combine an arbitrary chain of shuffles into a single instruction if 19516 /// possible. 19517 /// 19518 /// This is the leaf of the recursive combinine below. When we have found some 19519 /// chain of single-use x86 shuffle instructions and accumulated the combined 19520 /// shuffle mask represented by them, this will try to pattern match that mask 19521 /// into either a single instruction if there is a special purpose instruction 19522 /// for this operation, or into a PSHUFB instruction which is a fully general 19523 /// instruction but should only be used to replace chains over a certain depth. 19524 static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask, 19525 int Depth, bool HasPSHUFB, SelectionDAG &DAG, 19526 TargetLowering::DAGCombinerInfo &DCI, 19527 const X86Subtarget *Subtarget) { 19528 assert(!Mask.empty() && "Cannot combine an empty shuffle mask!"); 19529 19530 // Find the operand that enters the chain. Note that multiple uses are OK 19531 // here, we're not going to remove the operand we find. 
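//
// [Editor's illustrative aside -- not part of the upstream file.] A rough map
// of what the matching below produces from an accumulated unary mask:
//   float domain:   <0,0> -> MOVDDUP (SSE3) or MOVLHPS;  <1,1> -> MOVHLPS;
//                   <0,0,2,2> -> MOVSLDUP;  <1,1,3,3> -> MOVSHDUP (both SSE3);
//                   <0,0,1,1> / <2,2,3,3> -> UNPCKLPS / UNPCKHPS
//   integer i16/i8: pairwise-duplicating masks -> PUNPCKL* / PUNPCKH*
//   otherwise, for chains deep enough (or already containing a PSHUFB) on
//   SSSE3 targets, a single PSHUFB is built whose byte i is
//   Ratio * Mask[i / Ratio] + i % Ratio (or 0xFF for zeroed lanes), with
//   Ratio = 16 / Mask.size().
//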
19532 SDValue Input = Op.getOperand(0); 19533 while (Input.getOpcode() == ISD::BITCAST) 19534 Input = Input.getOperand(0); 19535 19536 MVT VT = Input.getSimpleValueType(); 19537 MVT RootVT = Root.getSimpleValueType(); 19538 SDLoc DL(Root); 19539 19540 // Just remove no-op shuffle masks. 19541 if (Mask.size() == 1) { 19542 DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Input), 19543 /*AddTo*/ true); 19544 return true; 19545 } 19546 19547 // Use the float domain if the operand type is a floating point type. 19548 bool FloatDomain = VT.isFloatingPoint(); 19549 19550 // For floating point shuffles, we don't have free copies in the shuffle 19551 // instructions or the ability to load as part of the instruction, so 19552 // canonicalize their shuffles to UNPCK or MOV variants. 19553 // 19554 // Note that even with AVX we prefer the PSHUFD form of shuffle for integer 19555 // vectors because it can have a load folded into it that UNPCK cannot. This 19556 // doesn't preclude something switching to the shorter encoding post-RA. 19557 if (FloatDomain) { 19558 if (Mask.equals(0, 0) || Mask.equals(1, 1)) { 19559 bool Lo = Mask.equals(0, 0); 19560 unsigned Shuffle; 19561 MVT ShuffleVT; 19562 // Check if we have SSE3 which will let us use MOVDDUP. That instruction 19563 // is no slower than UNPCKLPD but has the option to fold the input operand 19564 // into even an unaligned memory load. 19565 if (Lo && Subtarget->hasSSE3()) { 19566 Shuffle = X86ISD::MOVDDUP; 19567 ShuffleVT = MVT::v2f64; 19568 } else { 19569 // We have MOVLHPS and MOVHLPS throughout SSE and they encode smaller 19570 // than the UNPCK variants. 19571 Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS; 19572 ShuffleVT = MVT::v4f32; 19573 } 19574 if (Depth == 1 && Root->getOpcode() == Shuffle) 19575 return false; // Nothing to do! 19576 Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input); 19577 DCI.AddToWorklist(Op.getNode()); 19578 if (Shuffle == X86ISD::MOVDDUP) 19579 Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op); 19580 else 19581 Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op); 19582 DCI.AddToWorklist(Op.getNode()); 19583 DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op), 19584 /*AddTo*/ true); 19585 return true; 19586 } 19587 if (Subtarget->hasSSE3() && 19588 (Mask.equals(0, 0, 2, 2) || Mask.equals(1, 1, 3, 3))) { 19589 bool Lo = Mask.equals(0, 0, 2, 2); 19590 unsigned Shuffle = Lo ? X86ISD::MOVSLDUP : X86ISD::MOVSHDUP; 19591 MVT ShuffleVT = MVT::v4f32; 19592 if (Depth == 1 && Root->getOpcode() == Shuffle) 19593 return false; // Nothing to do! 19594 Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input); 19595 DCI.AddToWorklist(Op.getNode()); 19596 Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op); 19597 DCI.AddToWorklist(Op.getNode()); 19598 DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op), 19599 /*AddTo*/ true); 19600 return true; 19601 } 19602 if (Mask.equals(0, 0, 1, 1) || Mask.equals(2, 2, 3, 3)) { 19603 bool Lo = Mask.equals(0, 0, 1, 1); 19604 unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH; 19605 MVT ShuffleVT = MVT::v4f32; 19606 if (Depth == 1 && Root->getOpcode() == Shuffle) 19607 return false; // Nothing to do! 
19608 Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input); 19609 DCI.AddToWorklist(Op.getNode()); 19610 Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op); 19611 DCI.AddToWorklist(Op.getNode()); 19612 DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op), 19613 /*AddTo*/ true); 19614 return true; 19615 } 19616 } 19617 19618 // We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK 19619 // variants as none of these have single-instruction variants that are 19620 // superior to the UNPCK formulation. 19621 if (!FloatDomain && 19622 (Mask.equals(0, 0, 1, 1, 2, 2, 3, 3) || 19623 Mask.equals(4, 4, 5, 5, 6, 6, 7, 7) || 19624 Mask.equals(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7) || 19625 Mask.equals(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 19626 15))) { 19627 bool Lo = Mask[0] == 0; 19628 unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH; 19629 if (Depth == 1 && Root->getOpcode() == Shuffle) 19630 return false; // Nothing to do! 19631 MVT ShuffleVT; 19632 switch (Mask.size()) { 19633 case 8: 19634 ShuffleVT = MVT::v8i16; 19635 break; 19636 case 16: 19637 ShuffleVT = MVT::v16i8; 19638 break; 19639 default: 19640 llvm_unreachable("Impossible mask size!"); 19641 }; 19642 Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input); 19643 DCI.AddToWorklist(Op.getNode()); 19644 Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op); 19645 DCI.AddToWorklist(Op.getNode()); 19646 DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op), 19647 /*AddTo*/ true); 19648 return true; 19649 } 19650 19651 // Don't try to re-form single instruction chains under any circumstances now 19652 // that we've done encoding canonicalization for them. 19653 if (Depth < 2) 19654 return false; 19655 19656 // If we have 3 or more shuffle instructions or a chain involving PSHUFB, we 19657 // can replace them with a single PSHUFB instruction profitably. Intel's 19658 // manuals suggest only using PSHUFB if doing so replacing 5 instructions, but 19659 // in practice PSHUFB tends to be *very* fast so we're more aggressive. 19660 if ((Depth >= 3 || HasPSHUFB) && Subtarget->hasSSSE3()) { 19661 SmallVector<SDValue, 16> PSHUFBMask; 19662 assert(Mask.size() <= 16 && "Can't shuffle elements smaller than bytes!"); 19663 int Ratio = 16 / Mask.size(); 19664 for (unsigned i = 0; i < 16; ++i) { 19665 int M = Mask[i / Ratio] != SM_SentinelZero 19666 ? Ratio * Mask[i / Ratio] + i % Ratio 19667 : 255; 19668 PSHUFBMask.push_back(DAG.getConstant(M, MVT::i8)); 19669 } 19670 Op = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Input); 19671 DCI.AddToWorklist(Op.getNode()); 19672 SDValue PSHUFBMaskOp = 19673 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, PSHUFBMask); 19674 DCI.AddToWorklist(PSHUFBMaskOp.getNode()); 19675 Op = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, Op, PSHUFBMaskOp); 19676 DCI.AddToWorklist(Op.getNode()); 19677 DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op), 19678 /*AddTo*/ true); 19679 return true; 19680 } 19681 19682 // Failed to find any combines. 19683 return false; 19684 } 19685 19686 /// \brief Fully generic combining of x86 shuffle instructions. 19687 /// 19688 /// This should be the last combine run over the x86 shuffle instructions. Once 19689 /// they have been fully optimized, this will recursively consider all chains 19690 /// of single-use shuffle instructions, build a generic model of the cumulative 19691 /// shuffle operation, and check for simpler instructions which implement this 19692 /// operation. 
We use this primarily for two purposes: 19693 /// 19694 /// 1) Collapse generic shuffles to specialized single instructions when 19695 /// equivalent. In most cases, this is just an encoding size win, but 19696 /// sometimes we will collapse multiple generic shuffles into a single 19697 /// special-purpose shuffle. 19698 /// 2) Look for sequences of shuffle instructions with 3 or more total 19699 /// instructions, and replace them with the slightly more expensive SSSE3 19700 /// PSHUFB instruction if available. We do this as the last combining step 19701 /// to ensure we avoid using PSHUFB if we can implement the shuffle with 19702 /// a suitable short sequence of other instructions. The PHUFB will either 19703 /// use a register or have to read from memory and so is slightly (but only 19704 /// slightly) more expensive than the other shuffle instructions. 19705 /// 19706 /// Because this is inherently a quadratic operation (for each shuffle in 19707 /// a chain, we recurse up the chain), the depth is limited to 8 instructions. 19708 /// This should never be an issue in practice as the shuffle lowering doesn't 19709 /// produce sequences of more than 8 instructions. 19710 /// 19711 /// FIXME: We will currently miss some cases where the redundant shuffling 19712 /// would simplify under the threshold for PSHUFB formation because of 19713 /// combine-ordering. To fix this, we should do the redundant instruction 19714 /// combining in this recursive walk. 19715 static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root, 19716 ArrayRef<int> RootMask, 19717 int Depth, bool HasPSHUFB, 19718 SelectionDAG &DAG, 19719 TargetLowering::DAGCombinerInfo &DCI, 19720 const X86Subtarget *Subtarget) { 19721 // Bound the depth of our recursive combine because this is ultimately 19722 // quadratic in nature. 19723 if (Depth > 8) 19724 return false; 19725 19726 // Directly rip through bitcasts to find the underlying operand. 19727 while (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).hasOneUse()) 19728 Op = Op.getOperand(0); 19729 19730 MVT VT = Op.getSimpleValueType(); 19731 if (!VT.isVector()) 19732 return false; // Bail if we hit a non-vector. 19733 // FIXME: This routine should be taught about 256-bit shuffles, or a 256-bit 19734 // version should be added. 19735 if (VT.getSizeInBits() != 128) 19736 return false; 19737 19738 assert(Root.getSimpleValueType().isVector() && 19739 "Shuffles operate on vector types!"); 19740 assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() && 19741 "Can only combine shuffles of the same vector register size."); 19742 19743 if (!isTargetShuffle(Op.getOpcode())) 19744 return false; 19745 SmallVector<int, 16> OpMask; 19746 bool IsUnary; 19747 bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, OpMask, IsUnary); 19748 // We only can combine unary shuffles which we can decode the mask for. 
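//
// [Editor's illustrative aside -- not part of the upstream file.] Worked
// example of the mask merge performed below: if the root mask is the
// 4-element mask <2,3,0,1> and this operand's mask is the 8-element mask
// <1,0,3,2,5,4,7,6>, then RootRatio = 2, OpRatio = 1, and merged element i is
//   OpMask[RootMask[i / 2] * 2 + i % 2]
// giving <5,4,7,6,1,0,3,2> -- the element-level effect of applying the
// operand's shuffle to the input first and the root's shuffle second.
//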
19749 if (!HaveMask || !IsUnary) 19750 return false; 19751 19752 assert(VT.getVectorNumElements() == OpMask.size() && 19753 "Different mask size from vector size!"); 19754 assert(((RootMask.size() > OpMask.size() && 19755 RootMask.size() % OpMask.size() == 0) || 19756 (OpMask.size() > RootMask.size() && 19757 OpMask.size() % RootMask.size() == 0) || 19758 OpMask.size() == RootMask.size()) && 19759 "The smaller number of elements must divide the larger."); 19760 int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size()); 19761 int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size()); 19762 assert(((RootRatio == 1 && OpRatio == 1) || 19763 (RootRatio == 1) != (OpRatio == 1)) && 19764 "Must not have a ratio for both incoming and op masks!"); 19765 19766 SmallVector<int, 16> Mask; 19767 Mask.reserve(std::max(OpMask.size(), RootMask.size())); 19768 19769 // Merge this shuffle operation's mask into our accumulated mask. Note that 19770 // this shuffle's mask will be the first applied to the input, followed by the 19771 // root mask to get us all the way to the root value arrangement. The reason 19772 // for this order is that we are recursing up the operation chain. 19773 for (int i = 0, e = std::max(OpMask.size(), RootMask.size()); i < e; ++i) { 19774 int RootIdx = i / RootRatio; 19775 if (RootMask[RootIdx] == SM_SentinelZero) { 19776 // This is a zero-ed lane, we're done. 19777 Mask.push_back(SM_SentinelZero); 19778 continue; 19779 } 19780 19781 int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio; 19782 int OpIdx = RootMaskedIdx / OpRatio; 19783 if (OpMask[OpIdx] == SM_SentinelZero) { 19784 // The incoming lanes are zero, it doesn't matter which ones we are using. 19785 Mask.push_back(SM_SentinelZero); 19786 continue; 19787 } 19788 19789 // Ok, we have non-zero lanes, map them through. 19790 Mask.push_back(OpMask[OpIdx] * OpRatio + 19791 RootMaskedIdx % OpRatio); 19792 } 19793 19794 // See if we can recurse into the operand to combine more things. 19795 switch (Op.getOpcode()) { 19796 case X86ISD::PSHUFB: 19797 HasPSHUFB = true; 19798 case X86ISD::PSHUFD: 19799 case X86ISD::PSHUFHW: 19800 case X86ISD::PSHUFLW: 19801 if (Op.getOperand(0).hasOneUse() && 19802 combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1, 19803 HasPSHUFB, DAG, DCI, Subtarget)) 19804 return true; 19805 break; 19806 19807 case X86ISD::UNPCKL: 19808 case X86ISD::UNPCKH: 19809 assert(Op.getOperand(0) == Op.getOperand(1) && "We only combine unary shuffles!"); 19810 // We can't check for single use, we have to check that this shuffle is the only user. 19811 if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) && 19812 combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1, 19813 HasPSHUFB, DAG, DCI, Subtarget)) 19814 return true; 19815 break; 19816 } 19817 19818 // Minor canonicalization of the accumulated shuffle mask to make it easier 19819 // to match below. All this does is detect masks with squential pairs of 19820 // elements, and shrink them to the half-width mask. It does this in a loop 19821 // so it will reduce the size of the mask to the minimal width mask which 19822 // performs an equivalent shuffle. 
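//
// [Editor's illustrative aside -- not part of the upstream file.] Example of
// the canonicalization below: the 8-element mask <2,3,2,3,6,7,6,7> consists
// purely of sequential pairs, so one pass of the loop halves it to <1,1,3,3>;
// that mask has no sequential pairs left, so the loop stops there.
//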
19823 while (Mask.size() > 1 && canWidenShuffleElements(Mask)) { 19824 for (int i = 0, e = Mask.size() / 2; i < e; ++i) 19825 Mask[i] = Mask[2 * i] / 2; 19826 Mask.resize(Mask.size() / 2); 19827 } 19828 19829 return combineX86ShuffleChain(Op, Root, Mask, Depth, HasPSHUFB, DAG, DCI, 19830 Subtarget); 19831 } 19832 19833 /// \brief Get the PSHUF-style mask from PSHUF node. 19834 /// 19835 /// This is a very minor wrapper around getTargetShuffleMask to easy forming v4 19836 /// PSHUF-style masks that can be reused with such instructions. 19837 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) { 19838 SmallVector<int, 4> Mask; 19839 bool IsUnary; 19840 bool HaveMask = getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), Mask, IsUnary); 19841 (void)HaveMask; 19842 assert(HaveMask); 19843 19844 switch (N.getOpcode()) { 19845 case X86ISD::PSHUFD: 19846 return Mask; 19847 case X86ISD::PSHUFLW: 19848 Mask.resize(4); 19849 return Mask; 19850 case X86ISD::PSHUFHW: 19851 Mask.erase(Mask.begin(), Mask.begin() + 4); 19852 for (int &M : Mask) 19853 M -= 4; 19854 return Mask; 19855 default: 19856 llvm_unreachable("No valid shuffle instruction found!"); 19857 } 19858 } 19859 19860 /// \brief Search for a combinable shuffle across a chain ending in pshufd. 19861 /// 19862 /// We walk up the chain and look for a combinable shuffle, skipping over 19863 /// shuffles that we could hoist this shuffle's transformation past without 19864 /// altering anything. 19865 static SDValue 19866 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask, 19867 SelectionDAG &DAG, 19868 TargetLowering::DAGCombinerInfo &DCI) { 19869 assert(N.getOpcode() == X86ISD::PSHUFD && 19870 "Called with something other than an x86 128-bit half shuffle!"); 19871 SDLoc DL(N); 19872 19873 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack 19874 // of the shuffles in the chain so that we can form a fresh chain to replace 19875 // this one. 19876 SmallVector<SDValue, 8> Chain; 19877 SDValue V = N.getOperand(0); 19878 for (; V.hasOneUse(); V = V.getOperand(0)) { 19879 switch (V.getOpcode()) { 19880 default: 19881 return SDValue(); // Nothing combined! 19882 19883 case ISD::BITCAST: 19884 // Skip bitcasts as we always know the type for the target specific 19885 // instructions. 19886 continue; 19887 19888 case X86ISD::PSHUFD: 19889 // Found another dword shuffle. 19890 break; 19891 19892 case X86ISD::PSHUFLW: 19893 // Check that the low words (being shuffled) are the identity in the 19894 // dword shuffle, and the high words are self-contained. 19895 if (Mask[0] != 0 || Mask[1] != 1 || 19896 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4)) 19897 return SDValue(); 19898 19899 Chain.push_back(V); 19900 continue; 19901 19902 case X86ISD::PSHUFHW: 19903 // Check that the high words (being shuffled) are the identity in the 19904 // dword shuffle, and the low words are self-contained. 19905 if (Mask[2] != 2 || Mask[3] != 3 || 19906 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2)) 19907 return SDValue(); 19908 19909 Chain.push_back(V); 19910 continue; 19911 19912 case X86ISD::UNPCKL: 19913 case X86ISD::UNPCKH: 19914 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword 19915 // shuffle into a preceding word shuffle. 19916 if (V.getValueType() != MVT::v16i8 && V.getValueType() != MVT::v8i16) 19917 return SDValue(); 19918 19919 // Search for a half-shuffle which we can combine with. 19920 unsigned CombineOp = 19921 V.getOpcode() == X86ISD::UNPCKL ? 
X86ISD::PSHUFLW : X86ISD::PSHUFHW; 19922 if (V.getOperand(0) != V.getOperand(1) || 19923 !V->isOnlyUserOf(V.getOperand(0).getNode())) 19924 return SDValue(); 19925 Chain.push_back(V); 19926 V = V.getOperand(0); 19927 do { 19928 switch (V.getOpcode()) { 19929 default: 19930 return SDValue(); // Nothing to combine. 19931 19932 case X86ISD::PSHUFLW: 19933 case X86ISD::PSHUFHW: 19934 if (V.getOpcode() == CombineOp) 19935 break; 19936 19937 Chain.push_back(V); 19938 19939 // Fallthrough! 19940 case ISD::BITCAST: 19941 V = V.getOperand(0); 19942 continue; 19943 } 19944 break; 19945 } while (V.hasOneUse()); 19946 break; 19947 } 19948 // Break out of the loop if we break out of the switch. 19949 break; 19950 } 19951 19952 if (!V.hasOneUse()) 19953 // We fell out of the loop without finding a viable combining instruction. 19954 return SDValue(); 19955 19956 // Merge this node's mask and our incoming mask. 19957 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V); 19958 for (int &M : Mask) 19959 M = VMask[M]; 19960 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0), 19961 getV4X86ShuffleImm8ForMask(Mask, DAG)); 19962 19963 // Rebuild the chain around this new shuffle. 19964 while (!Chain.empty()) { 19965 SDValue W = Chain.pop_back_val(); 19966 19967 if (V.getValueType() != W.getOperand(0).getValueType()) 19968 V = DAG.getNode(ISD::BITCAST, DL, W.getOperand(0).getValueType(), V); 19969 19970 switch (W.getOpcode()) { 19971 default: 19972 llvm_unreachable("Only PSHUF and UNPCK instructions get here!"); 19973 19974 case X86ISD::UNPCKL: 19975 case X86ISD::UNPCKH: 19976 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V); 19977 break; 19978 19979 case X86ISD::PSHUFD: 19980 case X86ISD::PSHUFLW: 19981 case X86ISD::PSHUFHW: 19982 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1)); 19983 break; 19984 } 19985 } 19986 if (V.getValueType() != N.getValueType()) 19987 V = DAG.getNode(ISD::BITCAST, DL, N.getValueType(), V); 19988 19989 // Return the new chain to replace N. 19990 return V; 19991 } 19992 19993 /// \brief Search for a combinable shuffle across a chain ending in pshuflw or pshufhw. 19994 /// 19995 /// We walk up the chain, skipping shuffles of the other half and looking 19996 /// through shuffles which switch halves trying to find a shuffle of the same 19997 /// pair of dwords. 19998 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask, 19999 SelectionDAG &DAG, 20000 TargetLowering::DAGCombinerInfo &DCI) { 20001 assert( 20002 (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) && 20003 "Called with something other than an x86 128-bit half shuffle!"); 20004 SDLoc DL(N); 20005 unsigned CombineOpcode = N.getOpcode(); 20006 20007 // Walk up a single-use chain looking for a combinable shuffle. 20008 SDValue V = N.getOperand(0); 20009 for (; V.hasOneUse(); V = V.getOperand(0)) { 20010 switch (V.getOpcode()) { 20011 default: 20012 return false; // Nothing combined! 20013 20014 case ISD::BITCAST: 20015 // Skip bitcasts as we always know the type for the target specific 20016 // instructions. 20017 continue; 20018 20019 case X86ISD::PSHUFLW: 20020 case X86ISD::PSHUFHW: 20021 if (V.getOpcode() == CombineOpcode) 20022 break; 20023 20024 // Other-half shuffles are no-ops. 20025 continue; 20026 } 20027 // Break out of the loop if we break out of the switch. 20028 break; 20029 } 20030 20031 if (!V.hasOneUse()) 20032 // We fell out of the loop without finding a viable combining instruction. 
20033 return false; 20034 20035 // Combine away the bottom node as its shuffle will be accumulated into 20036 // a preceding shuffle. 20037 DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true); 20038 20039 // Record the old value. 20040 SDValue Old = V; 20041 20042 // Merge this node's mask and our incoming mask (adjusted to account for all 20043 // the pshufd instructions encountered). 20044 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V); 20045 for (int &M : Mask) 20046 M = VMask[M]; 20047 V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0), 20048 getV4X86ShuffleImm8ForMask(Mask, DAG)); 20049 20050 // Check that the shuffles didn't cancel each other out. If not, we need to 20051 // combine to the new one. 20052 if (Old != V) 20053 // Replace the combinable shuffle with the combined one, updating all users 20054 // so that we re-evaluate the chain here. 20055 DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true); 20056 20057 return true; 20058 } 20059 20060 /// \brief Try to combine x86 target specific shuffles. 20061 static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, 20062 TargetLowering::DAGCombinerInfo &DCI, 20063 const X86Subtarget *Subtarget) { 20064 SDLoc DL(N); 20065 MVT VT = N.getSimpleValueType(); 20066 SmallVector<int, 4> Mask; 20067 20068 switch (N.getOpcode()) { 20069 case X86ISD::PSHUFD: 20070 case X86ISD::PSHUFLW: 20071 case X86ISD::PSHUFHW: 20072 Mask = getPSHUFShuffleMask(N); 20073 assert(Mask.size() == 4); 20074 break; 20075 default: 20076 return SDValue(); 20077 } 20078 20079 // Nuke no-op shuffles that show up after combining. 20080 if (isNoopShuffleMask(Mask)) 20081 return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true); 20082 20083 // Look for simplifications involving one or two shuffle instructions. 20084 SDValue V = N.getOperand(0); 20085 switch (N.getOpcode()) { 20086 default: 20087 break; 20088 case X86ISD::PSHUFLW: 20089 case X86ISD::PSHUFHW: 20090 assert(VT == MVT::v8i16); 20091 (void)VT; 20092 20093 if (combineRedundantHalfShuffle(N, Mask, DAG, DCI)) 20094 return SDValue(); // We combined away this shuffle, so we're done. 20095 20096 // See if this reduces to a PSHUFD which is no more expensive and can 20097 // combine with more operations. 20098 if (canWidenShuffleElements(Mask)) { 20099 int DMask[] = {-1, -1, -1, -1}; 20100 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2; 20101 DMask[DOffset + 0] = DOffset + Mask[0] / 2; 20102 DMask[DOffset + 1] = DOffset + Mask[2] / 2; 20103 V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V); 20104 DCI.AddToWorklist(V.getNode()); 20105 V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V, 20106 getV4X86ShuffleImm8ForMask(DMask, DAG)); 20107 DCI.AddToWorklist(V.getNode()); 20108 return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V); 20109 } 20110 20111 // Look for shuffle patterns which can be implemented as a single unpack. 20112 // FIXME: This doesn't handle the location of the PSHUFD generically, and 20113 // only works when we have a PSHUFD followed by two half-shuffles. 
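//
// [Editor's illustrative aside -- not part of the upstream file.] Example of
// the pattern matched below: a PSHUFD that routes source dword 0 to dword 0
// and source dword 1 to dword 3, followed by PSHUFLW <0,0,1,1> and then a
// PSHUFHW that duplicates the two words now sitting in the high dword, yields
// the word arrangement <0,0,1,1,2,2,3,3> -- exactly PUNPCKLWD of the source
// with itself -- so all three shuffles are replaced by a single unpack.
//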
20114 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] && 20115 (V.getOpcode() == X86ISD::PSHUFLW || 20116 V.getOpcode() == X86ISD::PSHUFHW) && 20117 V.getOpcode() != N.getOpcode() && 20118 V.hasOneUse()) { 20119 SDValue D = V.getOperand(0); 20120 while (D.getOpcode() == ISD::BITCAST && D.hasOneUse()) 20121 D = D.getOperand(0); 20122 if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) { 20123 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V); 20124 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D); 20125 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4; 20126 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4; 20127 int WordMask[8]; 20128 for (int i = 0; i < 4; ++i) { 20129 WordMask[i + NOffset] = Mask[i] + NOffset; 20130 WordMask[i + VOffset] = VMask[i] + VOffset; 20131 } 20132 // Map the word mask through the DWord mask. 20133 int MappedMask[8]; 20134 for (int i = 0; i < 8; ++i) 20135 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2; 20136 const int UnpackLoMask[] = {0, 0, 1, 1, 2, 2, 3, 3}; 20137 const int UnpackHiMask[] = {4, 4, 5, 5, 6, 6, 7, 7}; 20138 if (std::equal(std::begin(MappedMask), std::end(MappedMask), 20139 std::begin(UnpackLoMask)) || 20140 std::equal(std::begin(MappedMask), std::end(MappedMask), 20141 std::begin(UnpackHiMask))) { 20142 // We can replace all three shuffles with an unpack. 20143 V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, D.getOperand(0)); 20144 DCI.AddToWorklist(V.getNode()); 20145 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL 20146 : X86ISD::UNPCKH, 20147 DL, MVT::v8i16, V, V); 20148 } 20149 } 20150 } 20151 20152 break; 20153 20154 case X86ISD::PSHUFD: 20155 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI)) 20156 return NewN; 20157 20158 break; 20159 } 20160 20161 return SDValue(); 20162 } 20163 20164 /// \brief Try to combine a shuffle into a target-specific add-sub node. 20165 /// 20166 /// We combine this directly on the abstract vector shuffle nodes so it is 20167 /// easier to generically match. We also insert dummy vector shuffle nodes for 20168 /// the operands which explicitly discard the lanes which are unused by this 20169 /// operation to try to flow through the rest of the combiner the fact that 20170 /// they're unused. 20171 static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) { 20172 SDLoc DL(N); 20173 EVT VT = N->getValueType(0); 20174 20175 // We only handle target-independent shuffles. 20176 // FIXME: It would be easy and harmless to use the target shuffle mask 20177 // extraction tool to support more. 20178 if (N->getOpcode() != ISD::VECTOR_SHUFFLE) 20179 return SDValue(); 20180 20181 auto *SVN = cast<ShuffleVectorSDNode>(N); 20182 ArrayRef<int> Mask = SVN->getMask(); 20183 SDValue V1 = N->getOperand(0); 20184 SDValue V2 = N->getOperand(1); 20185 20186 // We require the first shuffle operand to be the SUB node, and the second to 20187 // be the ADD node. 20188 // FIXME: We should support the commuted patterns. 20189 if (V1->getOpcode() != ISD::FSUB || V2->getOpcode() != ISD::FADD) 20190 return SDValue(); 20191 20192 // If there are other uses of these operations we can't fold them. 20193 if (!V1->hasOneUse() || !V2->hasOneUse()) 20194 return SDValue(); 20195 20196 // Ensure that both operations have the same operands. Note that we can 20197 // commute the FADD operands. 
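//
// [Editor's illustrative aside -- not part of the upstream file.] What the
// mask check below encodes, taking the v4f32 case: the accepted mask <0,5,2,7>
// takes lanes 0 and 2 from the FSUB result and lanes 1 and 3 from the FADD
// result, i.e. subtract in the even lanes and add in the odd lanes -- exactly
// the semantics of ADDSUBPS -- so the blend collapses to one X86ISD::ADDSUB.
//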
20198 SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1); 20199 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) && 20200 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS)) 20201 return SDValue(); 20202 20203 // We're looking for blends between FADD and FSUB nodes. We insist on these 20204 // nodes being lined up in a specific expected pattern. 20205 if (!(isShuffleEquivalent(Mask, 0, 3) || 20206 isShuffleEquivalent(Mask, 0, 5, 2, 7) || 20207 isShuffleEquivalent(Mask, 0, 9, 2, 11, 4, 13, 6, 15))) 20208 return SDValue(); 20209 20210 // Only specific types are legal at this point, assert so we notice if and 20211 // when these change. 20212 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v8f32 || 20213 VT == MVT::v4f64) && 20214 "Unknown vector type encountered!"); 20215 20216 return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS); 20217 } 20218 20219 /// PerformShuffleCombine - Performs several different shuffle combines. 20220 static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, 20221 TargetLowering::DAGCombinerInfo &DCI, 20222 const X86Subtarget *Subtarget) { 20223 SDLoc dl(N); 20224 SDValue N0 = N->getOperand(0); 20225 SDValue N1 = N->getOperand(1); 20226 EVT VT = N->getValueType(0); 20227 20228 // Don't create instructions with illegal types after legalize types has run. 20229 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 20230 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType())) 20231 return SDValue(); 20232 20233 // If we have legalized the vector types, look for blends of FADD and FSUB 20234 // nodes that we can fuse into an ADDSUB node. 20235 if (TLI.isTypeLegal(VT) && Subtarget->hasSSE3()) 20236 if (SDValue AddSub = combineShuffleToAddSub(N, DAG)) 20237 return AddSub; 20238 20239 // Combine 256-bit vector shuffles. This is only profitable when in AVX mode 20240 if (Subtarget->hasFp256() && VT.is256BitVector() && 20241 N->getOpcode() == ISD::VECTOR_SHUFFLE) 20242 return PerformShuffleCombine256(N, DAG, DCI, Subtarget); 20243 20244 // During Type Legalization, when promoting illegal vector types, 20245 // the backend might introduce new shuffle dag nodes and bitcasts. 20246 // 20247 // This code performs the following transformation: 20248 // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) -> 20249 // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>) 20250 // 20251 // We do this only if both the bitcast and the BINOP dag nodes have 20252 // one use. Also, perform this transformation only if the new binary 20253 // operation is legal. This is to avoid introducing dag nodes that 20254 // potentially need to be further expanded (or custom lowered) into a 20255 // less optimal sequence of dag nodes. 
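// --- Illustrative aside (not part of X86ISelLowering.cpp) -------------------
// A standalone sketch of the only mask shape the fold below accepts: the
// first SrcNumElts lanes must pick every second element of the bitcast value
// in order, and all remaining lanes must be undef.  That is the shape the
// promotion path described above tends to produce, so nothing else is worth
// rewriting here.  isPromotedTruncMask is a name invented for this sketch.
#include <vector>

static bool isPromotedTruncMask(const std::vector<int> &Mask,
                                unsigned SrcNumElts) {
  if (Mask.size() != 2 * SrcNumElts)
    return false;
  for (unsigned i = 0; i != SrcNumElts; ++i)
    if (Mask[i] != static_cast<int>(2 * i)) // low half: 0, 2, 4, ...
      return false;
  for (unsigned i = SrcNumElts; i != Mask.size(); ++i)
    if (Mask[i] >= 0)                       // high half: undef only
      return false;
  return true;
}
// -----------------------------------------------------------------------------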
20256 if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() && 20257 N1.getOpcode() == ISD::UNDEF && N0.hasOneUse() && 20258 N0.getOpcode() == ISD::BITCAST) { 20259 SDValue BC0 = N0.getOperand(0); 20260 EVT SVT = BC0.getValueType(); 20261 unsigned Opcode = BC0.getOpcode(); 20262 unsigned NumElts = VT.getVectorNumElements(); 20263 20264 if (BC0.hasOneUse() && SVT.isVector() && 20265 SVT.getVectorNumElements() * 2 == NumElts && 20266 TLI.isOperationLegal(Opcode, VT)) { 20267 bool CanFold = false; 20268 switch (Opcode) { 20269 default : break; 20270 case ISD::ADD : 20271 case ISD::FADD : 20272 case ISD::SUB : 20273 case ISD::FSUB : 20274 case ISD::MUL : 20275 case ISD::FMUL : 20276 CanFold = true; 20277 } 20278 20279 unsigned SVTNumElts = SVT.getVectorNumElements(); 20280 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 20281 for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i) 20282 CanFold = SVOp->getMaskElt(i) == (int)(i * 2); 20283 for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i) 20284 CanFold = SVOp->getMaskElt(i) < 0; 20285 20286 if (CanFold) { 20287 SDValue BC00 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(0)); 20288 SDValue BC01 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(1)); 20289 SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01); 20290 return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, &SVOp->getMask()[0]); 20291 } 20292 } 20293 } 20294 20295 // Only handle 128 wide vector from here on. 20296 if (!VT.is128BitVector()) 20297 return SDValue(); 20298 20299 // Combine a vector_shuffle that is equal to build_vector load1, load2, load3, 20300 // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are 20301 // consecutive, non-overlapping, and in the right order. 20302 SmallVector<SDValue, 16> Elts; 20303 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) 20304 Elts.push_back(getShuffleScalarElt(N, i, DAG, 0)); 20305 20306 SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true); 20307 if (LD.getNode()) 20308 return LD; 20309 20310 if (isTargetShuffle(N->getOpcode())) { 20311 SDValue Shuffle = 20312 PerformTargetShuffleCombine(SDValue(N, 0), DAG, DCI, Subtarget); 20313 if (Shuffle.getNode()) 20314 return Shuffle; 20315 20316 // Try recursively combining arbitrary sequences of x86 shuffle 20317 // instructions into higher-order shuffles. We do this after combining 20318 // specific PSHUF instruction sequences into their minimal form so that we 20319 // can evaluate how many specialized shuffle instructions are involved in 20320 // a particular chain. 20321 SmallVector<int, 1> NonceMask; // Just a placeholder. 20322 NonceMask.push_back(0); 20323 if (combineX86ShufflesRecursively(SDValue(N, 0), SDValue(N, 0), NonceMask, 20324 /*Depth*/ 1, /*HasPSHUFB*/ false, DAG, 20325 DCI, Subtarget)) 20326 return SDValue(); // This routine will use CombineTo to replace N. 20327 } 20328 20329 return SDValue(); 20330 } 20331 20332 /// PerformTruncateCombine - Converts truncate operation to 20333 /// a sequence of vector shuffle operations. 20334 /// It is possible when we truncate 256-bit vector to 128-bit vector 20335 static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG, 20336 TargetLowering::DAGCombinerInfo &DCI, 20337 const X86Subtarget *Subtarget) { 20338 return SDValue(); 20339 } 20340 20341 /// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target 20342 /// specific shuffle of a load can be folded into a single element load. 
20343 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but 20344 /// shuffles have been customed lowered so we need to handle those here. 20345 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, 20346 TargetLowering::DAGCombinerInfo &DCI) { 20347 if (DCI.isBeforeLegalizeOps()) 20348 return SDValue(); 20349 20350 SDValue InVec = N->getOperand(0); 20351 SDValue EltNo = N->getOperand(1); 20352 20353 if (!isa<ConstantSDNode>(EltNo)) 20354 return SDValue(); 20355 20356 EVT VT = InVec.getValueType(); 20357 20358 if (InVec.getOpcode() == ISD::BITCAST) { 20359 // Don't duplicate a load with other uses. 20360 if (!InVec.hasOneUse()) 20361 return SDValue(); 20362 EVT BCVT = InVec.getOperand(0).getValueType(); 20363 if (BCVT.getVectorNumElements() != VT.getVectorNumElements()) 20364 return SDValue(); 20365 InVec = InVec.getOperand(0); 20366 } 20367 20368 if (!isTargetShuffle(InVec.getOpcode())) 20369 return SDValue(); 20370 20371 // Don't duplicate a load with other uses. 20372 if (!InVec.hasOneUse()) 20373 return SDValue(); 20374 20375 SmallVector<int, 16> ShuffleMask; 20376 bool UnaryShuffle; 20377 if (!getTargetShuffleMask(InVec.getNode(), VT.getSimpleVT(), ShuffleMask, 20378 UnaryShuffle)) 20379 return SDValue(); 20380 20381 // Select the input vector, guarding against out of range extract vector. 20382 unsigned NumElems = VT.getVectorNumElements(); 20383 int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); 20384 int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt]; 20385 SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0) 20386 : InVec.getOperand(1); 20387 20388 // If inputs to shuffle are the same for both ops, then allow 2 uses 20389 unsigned AllowedUses = InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1; 20390 20391 if (LdNode.getOpcode() == ISD::BITCAST) { 20392 // Don't duplicate a load with other uses. 20393 if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0)) 20394 return SDValue(); 20395 20396 AllowedUses = 1; // only allow 1 load use if we have a bitcast 20397 LdNode = LdNode.getOperand(0); 20398 } 20399 20400 if (!ISD::isNormalLoad(LdNode.getNode())) 20401 return SDValue(); 20402 20403 LoadSDNode *LN0 = cast<LoadSDNode>(LdNode); 20404 20405 if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile()) 20406 return SDValue(); 20407 20408 EVT EltVT = N->getValueType(0); 20409 // If there's a bitcast before the shuffle, check if the load type and 20410 // alignment is valid. 20411 unsigned Align = LN0->getAlignment(); 20412 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 20413 unsigned NewAlign = TLI.getDataLayout()->getABITypeAlignment( 20414 EltVT.getTypeForEVT(*DAG.getContext())); 20415 20416 if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT)) 20417 return SDValue(); 20418 20419 // All checks match so transform back to vector_shuffle so that DAG combiner 20420 // can finish the job 20421 SDLoc dl(N); 20422 20423 // Create shuffle node taking into account the case that its a unary shuffle 20424 SDValue Shuffle = (UnaryShuffle) ? 
DAG.getUNDEF(VT) : InVec.getOperand(1); 20425 Shuffle = DAG.getVectorShuffle(InVec.getValueType(), dl, 20426 InVec.getOperand(0), Shuffle, 20427 &ShuffleMask[0]); 20428 Shuffle = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); 20429 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle, 20430 EltNo); 20431 } 20432 20433 /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index 20434 /// generation and convert it from being a bunch of shuffles and extracts 20435 /// to a simple store and scalar loads to extract the elements. 20436 static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, 20437 TargetLowering::DAGCombinerInfo &DCI) { 20438 SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI); 20439 if (NewOp.getNode()) 20440 return NewOp; 20441 20442 SDValue InputVector = N->getOperand(0); 20443 20444 // Detect whether we are trying to convert from mmx to i32 and the bitcast 20445 // from mmx to v2i32 has a single usage. 20446 if (InputVector.getNode()->getOpcode() == llvm::ISD::BITCAST && 20447 InputVector.getNode()->getOperand(0).getValueType() == MVT::x86mmx && 20448 InputVector.hasOneUse() && N->getValueType(0) == MVT::i32) 20449 return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector), 20450 N->getValueType(0), 20451 InputVector.getNode()->getOperand(0)); 20452 20453 // Only operate on vectors of 4 elements, where the alternative shuffling 20454 // gets to be more expensive. 20455 if (InputVector.getValueType() != MVT::v4i32) 20456 return SDValue(); 20457 20458 // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a 20459 // single use which is a sign-extend or zero-extend, and all elements are 20460 // used. 20461 SmallVector<SDNode *, 4> Uses; 20462 unsigned ExtractedElements = 0; 20463 for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(), 20464 UE = InputVector.getNode()->use_end(); UI != UE; ++UI) { 20465 if (UI.getUse().getResNo() != InputVector.getResNo()) 20466 return SDValue(); 20467 20468 SDNode *Extract = *UI; 20469 if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) 20470 return SDValue(); 20471 20472 if (Extract->getValueType(0) != MVT::i32) 20473 return SDValue(); 20474 if (!Extract->hasOneUse()) 20475 return SDValue(); 20476 if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND && 20477 Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND) 20478 return SDValue(); 20479 if (!isa<ConstantSDNode>(Extract->getOperand(1))) 20480 return SDValue(); 20481 20482 // Record which element was extracted. 20483 ExtractedElements |= 20484 1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue(); 20485 20486 Uses.push_back(Extract); 20487 } 20488 20489 // If not all the elements were used, this may not be worthwhile. 20490 if (ExtractedElements != 15) 20491 return SDValue(); 20492 20493 // Ok, we've now decided to do the transformation. 20494 SDLoc dl(InputVector); 20495 20496 // Store the value to a temporary stack slot. 20497 SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType()); 20498 SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, 20499 MachinePointerInfo(), false, false, 0); 20500 20501 // Replace each use (extract) with a load of the appropriate element. 20502 for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(), 20503 UE = Uses.end(); UI != UE; ++UI) { 20504 SDNode *Extract = *UI; 20505 20506 // cOMpute the element's address. 
20507 SDValue Idx = Extract->getOperand(1); 20508 unsigned EltSize = 20509 InputVector.getValueType().getVectorElementType().getSizeInBits()/8; 20510 uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue(); 20511 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 20512 SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy()); 20513 20514 SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), 20515 StackPtr, OffsetVal); 20516 20517 // Load the scalar. 20518 SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch, 20519 ScalarAddr, MachinePointerInfo(), 20520 false, false, false, 0); 20521 20522 // Replace the exact with the load. 20523 DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar); 20524 } 20525 20526 // The replacement was made in place; don't return anything. 20527 return SDValue(); 20528 } 20529 20530 /// \brief Matches a VSELECT onto min/max or return 0 if the node doesn't match. 20531 static std::pair<unsigned, bool> 20532 matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS, 20533 SelectionDAG &DAG, const X86Subtarget *Subtarget) { 20534 if (!VT.isVector()) 20535 return std::make_pair(0, false); 20536 20537 bool NeedSplit = false; 20538 switch (VT.getSimpleVT().SimpleTy) { 20539 default: return std::make_pair(0, false); 20540 case MVT::v32i8: 20541 case MVT::v16i16: 20542 case MVT::v8i32: 20543 if (!Subtarget->hasAVX2()) 20544 NeedSplit = true; 20545 if (!Subtarget->hasAVX()) 20546 return std::make_pair(0, false); 20547 break; 20548 case MVT::v16i8: 20549 case MVT::v8i16: 20550 case MVT::v4i32: 20551 if (!Subtarget->hasSSE2()) 20552 return std::make_pair(0, false); 20553 } 20554 20555 // SSE2 has only a small subset of the operations. 20556 bool hasUnsigned = Subtarget->hasSSE41() || 20557 (Subtarget->hasSSE2() && VT == MVT::v16i8); 20558 bool hasSigned = Subtarget->hasSSE41() || 20559 (Subtarget->hasSSE2() && VT == MVT::v8i16); 20560 20561 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 20562 20563 unsigned Opc = 0; 20564 // Check for x CC y ? x : y. 20565 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) && 20566 DAG.isEqualTo(RHS, Cond.getOperand(1))) { 20567 switch (CC) { 20568 default: break; 20569 case ISD::SETULT: 20570 case ISD::SETULE: 20571 Opc = hasUnsigned ? X86ISD::UMIN : 0; break; 20572 case ISD::SETUGT: 20573 case ISD::SETUGE: 20574 Opc = hasUnsigned ? X86ISD::UMAX : 0; break; 20575 case ISD::SETLT: 20576 case ISD::SETLE: 20577 Opc = hasSigned ? X86ISD::SMIN : 0; break; 20578 case ISD::SETGT: 20579 case ISD::SETGE: 20580 Opc = hasSigned ? X86ISD::SMAX : 0; break; 20581 } 20582 // Check for x CC y ? y : x -- a min/max with reversed arms. 20583 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && 20584 DAG.isEqualTo(RHS, Cond.getOperand(0))) { 20585 switch (CC) { 20586 default: break; 20587 case ISD::SETULT: 20588 case ISD::SETULE: 20589 Opc = hasUnsigned ? X86ISD::UMAX : 0; break; 20590 case ISD::SETUGT: 20591 case ISD::SETUGE: 20592 Opc = hasUnsigned ? X86ISD::UMIN : 0; break; 20593 case ISD::SETLT: 20594 case ISD::SETLE: 20595 Opc = hasSigned ? X86ISD::SMAX : 0; break; 20596 case ISD::SETGT: 20597 case ISD::SETGE: 20598 Opc = hasSigned ? 
X86ISD::SMIN : 0; break; 20599 } 20600 } 20601 20602 return std::make_pair(Opc, NeedSplit); 20603 } 20604 20605 static SDValue 20606 TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG, 20607 const X86Subtarget *Subtarget) { 20608 SDLoc dl(N); 20609 SDValue Cond = N->getOperand(0); 20610 SDValue LHS = N->getOperand(1); 20611 SDValue RHS = N->getOperand(2); 20612 20613 if (Cond.getOpcode() == ISD::SIGN_EXTEND) { 20614 SDValue CondSrc = Cond->getOperand(0); 20615 if (CondSrc->getOpcode() == ISD::SIGN_EXTEND_INREG) 20616 Cond = CondSrc->getOperand(0); 20617 } 20618 20619 MVT VT = N->getSimpleValueType(0); 20620 MVT EltVT = VT.getVectorElementType(); 20621 unsigned NumElems = VT.getVectorNumElements(); 20622 // There is no blend with immediate in AVX-512. 20623 if (VT.is512BitVector()) 20624 return SDValue(); 20625 20626 if (!Subtarget->hasSSE41() || EltVT == MVT::i8) 20627 return SDValue(); 20628 if (!Subtarget->hasInt256() && VT == MVT::v16i16) 20629 return SDValue(); 20630 20631 if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) 20632 return SDValue(); 20633 20634 // A vselect where all conditions and data are constants can be optimized into 20635 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR(). 20636 if (ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) && 20637 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) 20638 return SDValue(); 20639 20640 unsigned MaskValue = 0; 20641 if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue)) 20642 return SDValue(); 20643 20644 SmallVector<int, 8> ShuffleMask(NumElems, -1); 20645 for (unsigned i = 0; i < NumElems; ++i) { 20646 // Be sure we emit undef where we can. 20647 if (Cond.getOperand(i)->getOpcode() == ISD::UNDEF) 20648 ShuffleMask[i] = -1; 20649 else 20650 ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1); 20651 } 20652 20653 return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]); 20654 } 20655 20656 /// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT 20657 /// nodes. 20658 static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, 20659 TargetLowering::DAGCombinerInfo &DCI, 20660 const X86Subtarget *Subtarget) { 20661 SDLoc DL(N); 20662 SDValue Cond = N->getOperand(0); 20663 // Get the LHS/RHS of the select. 20664 SDValue LHS = N->getOperand(1); 20665 SDValue RHS = N->getOperand(2); 20666 EVT VT = LHS.getValueType(); 20667 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 20668 20669 // If we have SSE[12] support, try to form min/max nodes. SSE min/max 20670 // instructions match the semantics of the common C idiom x<y?x:y but not 20671 // x<=y?x:y, because of how they handle negative zero (which can be 20672 // ignored in unsafe-math mode). 20673 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() && 20674 VT != MVT::f80 && TLI.isTypeLegal(VT) && 20675 (Subtarget->hasSSE2() || 20676 (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) { 20677 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 20678 20679 unsigned Opcode = 0; 20680 // Check for x CC y ? x : y. 20681 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) && 20682 DAG.isEqualTo(RHS, Cond.getOperand(1))) { 20683 switch (CC) { 20684 default: break; 20685 case ISD::SETULT: 20686 // Converting this to a min would handle NaNs incorrectly, and swapping 20687 // the operands would cause it to handle comparisons between positive 20688 // and negative zero incorrectly. 
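// --- Illustrative aside (not part of X86ISelLowering.cpp) -------------------
// A standalone scalar model of what MINSS/MINPS (and, mirrored, MAXSS/MAXPS)
// compute.  The hardware operation is literally "a < b ? a : b", which is why
// the guards below must rule out NaNs and mixed-sign zeros, or swap the
// operands, before the select can become X86ISD::FMIN.  x86MinModel is a
// sketch-only name; the asserts assume IEEE-754 floats.
#include <cassert>
#include <cmath>
#include <limits>

static float x86MinModel(float a, float b) {
  return a < b ? a : b; // unordered or equal (incl. -0.0 vs +0.0) picks b
}

static void x86MinModelExamples() {
  const float NaN = std::numeric_limits<float>::quiet_NaN();
  assert(!std::isnan(x86MinModel(NaN, 1.0f)));     // NaN in 1st operand lost
  assert(std::isnan(x86MinModel(1.0f, NaN)));      // NaN in 2nd one survives
  assert(!std::signbit(x86MinModel(-0.0f, 0.0f))); // returns +0.0
  assert(std::signbit(x86MinModel(0.0f, -0.0f)));  // returns -0.0
}
// -----------------------------------------------------------------------------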
20689 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { 20690 if (!DAG.getTarget().Options.UnsafeFPMath && 20691 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) 20692 break; 20693 std::swap(LHS, RHS); 20694 } 20695 Opcode = X86ISD::FMIN; 20696 break; 20697 case ISD::SETOLE: 20698 // Converting this to a min would handle comparisons between positive 20699 // and negative zero incorrectly. 20700 if (!DAG.getTarget().Options.UnsafeFPMath && 20701 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) 20702 break; 20703 Opcode = X86ISD::FMIN; 20704 break; 20705 case ISD::SETULE: 20706 // Converting this to a min would handle both negative zeros and NaNs 20707 // incorrectly, but we can swap the operands to fix both. 20708 std::swap(LHS, RHS); 20709 case ISD::SETOLT: 20710 case ISD::SETLT: 20711 case ISD::SETLE: 20712 Opcode = X86ISD::FMIN; 20713 break; 20714 20715 case ISD::SETOGE: 20716 // Converting this to a max would handle comparisons between positive 20717 // and negative zero incorrectly. 20718 if (!DAG.getTarget().Options.UnsafeFPMath && 20719 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) 20720 break; 20721 Opcode = X86ISD::FMAX; 20722 break; 20723 case ISD::SETUGT: 20724 // Converting this to a max would handle NaNs incorrectly, and swapping 20725 // the operands would cause it to handle comparisons between positive 20726 // and negative zero incorrectly. 20727 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { 20728 if (!DAG.getTarget().Options.UnsafeFPMath && 20729 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) 20730 break; 20731 std::swap(LHS, RHS); 20732 } 20733 Opcode = X86ISD::FMAX; 20734 break; 20735 case ISD::SETUGE: 20736 // Converting this to a max would handle both negative zeros and NaNs 20737 // incorrectly, but we can swap the operands to fix both. 20738 std::swap(LHS, RHS); 20739 case ISD::SETOGT: 20740 case ISD::SETGT: 20741 case ISD::SETGE: 20742 Opcode = X86ISD::FMAX; 20743 break; 20744 } 20745 // Check for x CC y ? y : x -- a min/max with reversed arms. 20746 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && 20747 DAG.isEqualTo(RHS, Cond.getOperand(0))) { 20748 switch (CC) { 20749 default: break; 20750 case ISD::SETOGE: 20751 // Converting this to a min would handle comparisons between positive 20752 // and negative zero incorrectly, and swapping the operands would 20753 // cause it to handle NaNs incorrectly. 20754 if (!DAG.getTarget().Options.UnsafeFPMath && 20755 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) { 20756 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 20757 break; 20758 std::swap(LHS, RHS); 20759 } 20760 Opcode = X86ISD::FMIN; 20761 break; 20762 case ISD::SETUGT: 20763 // Converting this to a min would handle NaNs incorrectly. 20764 if (!DAG.getTarget().Options.UnsafeFPMath && 20765 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) 20766 break; 20767 Opcode = X86ISD::FMIN; 20768 break; 20769 case ISD::SETUGE: 20770 // Converting this to a min would handle both negative zeros and NaNs 20771 // incorrectly, but we can swap the operands to fix both. 20772 std::swap(LHS, RHS); 20773 case ISD::SETOGT: 20774 case ISD::SETGT: 20775 case ISD::SETGE: 20776 Opcode = X86ISD::FMIN; 20777 break; 20778 20779 case ISD::SETULT: 20780 // Converting this to a max would handle NaNs incorrectly. 
20781 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 20782 break; 20783 Opcode = X86ISD::FMAX; 20784 break; 20785 case ISD::SETOLE: 20786 // Converting this to a max would handle comparisons between positive 20787 // and negative zero incorrectly, and swapping the operands would 20788 // cause it to handle NaNs incorrectly. 20789 if (!DAG.getTarget().Options.UnsafeFPMath && 20790 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) { 20791 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 20792 break; 20793 std::swap(LHS, RHS); 20794 } 20795 Opcode = X86ISD::FMAX; 20796 break; 20797 case ISD::SETULE: 20798 // Converting this to a max would handle both negative zeros and NaNs 20799 // incorrectly, but we can swap the operands to fix both. 20800 std::swap(LHS, RHS); 20801 case ISD::SETOLT: 20802 case ISD::SETLT: 20803 case ISD::SETLE: 20804 Opcode = X86ISD::FMAX; 20805 break; 20806 } 20807 } 20808 20809 if (Opcode) 20810 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS); 20811 } 20812 20813 EVT CondVT = Cond.getValueType(); 20814 if (Subtarget->hasAVX512() && VT.isVector() && CondVT.isVector() && 20815 CondVT.getVectorElementType() == MVT::i1) { 20816 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper 20817 // lowering on KNL. In this case we convert it to 20818 // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction. 20819 // The same situation for all 128 and 256-bit vectors of i8 and i16. 20820 // Since SKX these selects have a proper lowering. 20821 EVT OpVT = LHS.getValueType(); 20822 if ((OpVT.is128BitVector() || OpVT.is256BitVector()) && 20823 (OpVT.getVectorElementType() == MVT::i8 || 20824 OpVT.getVectorElementType() == MVT::i16) && 20825 !(Subtarget->hasBWI() && Subtarget->hasVLX())) { 20826 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond); 20827 DCI.AddToWorklist(Cond.getNode()); 20828 return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS); 20829 } 20830 } 20831 // If this is a select between two integer constants, try to do some 20832 // optimizations. 20833 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) { 20834 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS)) 20835 // Don't do this for crazy integer types. 20836 if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) { 20837 // If this is efficiently invertible, canonicalize the LHSC/RHSC values 20838 // so that TrueC (the true value) is larger than FalseC. 20839 bool NeedsCondInvert = false; 20840 20841 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) && 20842 // Efficiently invertible. 20843 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible. 20844 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible. 20845 isa<ConstantSDNode>(Cond.getOperand(1))))) { 20846 NeedsCondInvert = true; 20847 std::swap(TrueC, FalseC); 20848 } 20849 20850 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0. 20851 if (FalseC->getAPIntValue() == 0 && 20852 TrueC->getAPIntValue().isPowerOf2()) { 20853 if (NeedsCondInvert) // Invert the condition if needed. 20854 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 20855 DAG.getConstant(1, Cond.getValueType())); 20856 20857 // Zero extend the condition if needed. 20858 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond); 20859 20860 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 20861 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond, 20862 DAG.getConstant(ShAmt, MVT::i8)); 20863 } 20864 20865 // Optimize Cond ? 
cst+1 : cst -> zext(setcc(C)+cst. 20866 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 20867 if (NeedsCondInvert) // Invert the condition if needed. 20868 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 20869 DAG.getConstant(1, Cond.getValueType())); 20870 20871 // Zero extend the condition if needed. 20872 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 20873 FalseC->getValueType(0), Cond); 20874 return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 20875 SDValue(FalseC, 0)); 20876 } 20877 20878 // Optimize cases that will turn into an LEA instruction. This requires 20879 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 20880 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 20881 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 20882 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 20883 20884 bool isFastMultiplier = false; 20885 if (Diff < 10) { 20886 switch ((unsigned char)Diff) { 20887 default: break; 20888 case 1: // result = add base, cond 20889 case 2: // result = lea base( , cond*2) 20890 case 3: // result = lea base(cond, cond*2) 20891 case 4: // result = lea base( , cond*4) 20892 case 5: // result = lea base(cond, cond*4) 20893 case 8: // result = lea base( , cond*8) 20894 case 9: // result = lea base(cond, cond*8) 20895 isFastMultiplier = true; 20896 break; 20897 } 20898 } 20899 20900 if (isFastMultiplier) { 20901 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 20902 if (NeedsCondInvert) // Invert the condition if needed. 20903 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 20904 DAG.getConstant(1, Cond.getValueType())); 20905 20906 // Zero extend the condition if needed. 20907 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 20908 Cond); 20909 // Scale the condition by the difference. 20910 if (Diff != 1) 20911 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 20912 DAG.getConstant(Diff, Cond.getValueType())); 20913 20914 // Add the base if non-zero. 20915 if (FalseC->getAPIntValue() != 0) 20916 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 20917 SDValue(FalseC, 0)); 20918 return Cond; 20919 } 20920 } 20921 } 20922 } 20923 20924 // Canonicalize max and min: 20925 // (x > y) ? x : y -> (x >= y) ? x : y 20926 // (x < y) ? x : y -> (x <= y) ? x : y 20927 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates 20928 // the need for an extra compare 20929 // against zero. e.g. 20930 // (x - y) > 0 : (x - y) ? 0 -> (x - y) >= 0 : (x - y) ? 0 20931 // subl %esi, %edi 20932 // testl %edi, %edi 20933 // movl $0, %eax 20934 // cmovgl %edi, %eax 20935 // => 20936 // xorl %eax, %eax 20937 // subl %esi, $edi 20938 // cmovsl %eax, %edi 20939 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC && 20940 DAG.isEqualTo(LHS, Cond.getOperand(0)) && 20941 DAG.isEqualTo(RHS, Cond.getOperand(1))) { 20942 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 20943 switch (CC) { 20944 default: break; 20945 case ISD::SETLT: 20946 case ISD::SETGT: { 20947 ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE; 20948 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), 20949 Cond.getOperand(0), Cond.getOperand(1), NewCC); 20950 return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS); 20951 } 20952 } 20953 } 20954 20955 // Early exit check 20956 if (!TLI.isTypeLegal(VT)) 20957 return SDValue(); 20958 20959 // Match VSELECTs into subs with unsigned saturation. 
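// --- Illustrative aside (not part of X86ISelLowering.cpp) -------------------
// A standalone scalar model of PSUBUS (subtract with unsigned saturation) and
// of the select shapes matched below.  With unsigned lanes,
//   x >= y  ? x - y    : 0     and the constant-canonicalized
//   x > C-1 ? x + (-C) : 0
// both compute exactly subus(x, y) / subus(x, C).  subusModel is a
// sketch-only name; the example uses 8-bit lanes.
#include <cassert>
#include <cstdint>

static uint8_t subusModel(uint8_t x, uint8_t y) {
  return x >= y ? static_cast<uint8_t>(x - y) : 0; // clamps at 0, never wraps
}

static void subusModelExamples() {
  assert(subusModel(200, 60) == 140); // ordinary difference
  assert(subusModel(60, 200) == 0);   // would wrap; saturates to 0 instead
  // With a non-zero constant C: "x > C-1" is the same test as "x >= C" for
  // unsigned x, and adding the two's complement of C is the same bits as
  // subtracting C.
  const uint8_t C = 50, x = 70;
  assert((x > static_cast<uint8_t>(C - 1)) == (x >= C));
  assert(static_cast<uint8_t>(x + static_cast<uint8_t>(-C)) ==
         static_cast<uint8_t>(x - C));
}
// -----------------------------------------------------------------------------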
20960 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC && 20961 // psubus is available in SSE2 and AVX2 for i8 and i16 vectors. 20962 ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) || 20963 (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) { 20964 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 20965 20966 // Check if one of the arms of the VSELECT is a zero vector. If it's on the 20967 // left side invert the predicate to simplify logic below. 20968 SDValue Other; 20969 if (ISD::isBuildVectorAllZeros(LHS.getNode())) { 20970 Other = RHS; 20971 CC = ISD::getSetCCInverse(CC, true); 20972 } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) { 20973 Other = LHS; 20974 } 20975 20976 if (Other.getNode() && Other->getNumOperands() == 2 && 20977 DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) { 20978 SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1); 20979 SDValue CondRHS = Cond->getOperand(1); 20980 20981 // Look for a general sub with unsigned saturation first. 20982 // x >= y ? x-y : 0 --> subus x, y 20983 // x > y ? x-y : 0 --> subus x, y 20984 if ((CC == ISD::SETUGE || CC == ISD::SETUGT) && 20985 Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS)) 20986 return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS); 20987 20988 if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS)) 20989 if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) { 20990 if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS)) 20991 if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode()) 20992 // If the RHS is a constant we have to reverse the const 20993 // canonicalization. 20994 // x > C-1 ? x+-C : 0 --> subus x, C 20995 if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD && 20996 CondRHSConst->getAPIntValue() == 20997 (-OpRHSConst->getAPIntValue() - 1)) 20998 return DAG.getNode( 20999 X86ISD::SUBUS, DL, VT, OpLHS, 21000 DAG.getConstant(-OpRHSConst->getAPIntValue(), VT)); 21001 21002 // Another special case: If C was a sign bit, the sub has been 21003 // canonicalized into a xor. 21004 // FIXME: Would it be better to use computeKnownBits to determine 21005 // whether it's safe to decanonicalize the xor? 21006 // x s< 0 ? x^C : 0 --> subus x, C 21007 if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR && 21008 ISD::isBuildVectorAllZeros(CondRHS.getNode()) && 21009 OpRHSConst->getAPIntValue().isSignBit()) 21010 // Note that we have to rebuild the RHS constant here to ensure we 21011 // don't rely on particular values of undef lanes. 21012 return DAG.getNode( 21013 X86ISD::SUBUS, DL, VT, OpLHS, 21014 DAG.getConstant(OpRHSConst->getAPIntValue(), VT)); 21015 } 21016 } 21017 } 21018 21019 // Try to match a min/max vector operation. 
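// --- Illustrative aside (not part of X86ISelLowering.cpp) -------------------
// A standalone sketch of the two select shapes matchIntegerMINMAX (above)
// recognizes for the vselect handled here: either the arms repeat the
// compared values directly, (x CC y) ? x : y, or they are swapped,
// (x CC y) ? y : x, which turns a min into a max and vice versa.  Shown on
// unsigned scalars; uminViaSelect/umaxViaSelect are sketch-only names.
#include <cassert>
#include <cstdint>

static uint32_t uminViaSelect(uint32_t x, uint32_t y) {
  return (x < y) ? x : y; // (x u< y) ? x : y  ->  umin(x, y)
}

static uint32_t umaxViaSelect(uint32_t x, uint32_t y) {
  return (x < y) ? y : x; // same compare, reversed arms  ->  umax(x, y)
}

static void minMaxSelectExamples() {
  assert(uminViaSelect(3, 9) == 3 && uminViaSelect(9, 3) == 3);
  assert(umaxViaSelect(3, 9) == 9 && umaxViaSelect(9, 3) == 9);
}
// -----------------------------------------------------------------------------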
21020   if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC) {
21021     std::pair<unsigned, bool> ret = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget);
21022     unsigned Opc = ret.first;
21023     bool NeedSplit = ret.second;
21024 
21025     if (Opc && NeedSplit) {
21026       unsigned NumElems = VT.getVectorNumElements();
21027       // Extract the LHS vectors
21028       SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, DL);
21029       SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, DL);
21030 
21031       // Extract the RHS vectors
21032       SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, DL);
21033       SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, DL);
21034 
21035       // Create min/max for each subvector
21036       LHS = DAG.getNode(Opc, DL, LHS1.getValueType(), LHS1, RHS1);
21037       RHS = DAG.getNode(Opc, DL, LHS2.getValueType(), LHS2, RHS2);
21038 
21039       // Merge the result
21040       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS, RHS);
21041     } else if (Opc)
21042       return DAG.getNode(Opc, DL, VT, LHS, RHS);
21043   }
21044 
21045   // Simplify vector selection if the selector will be produced by CMPP*/PCMP*.
21046   if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
21047       // Check if SETCC has already been promoted
21048       TLI.getSetCCResultType(*DAG.getContext(), VT) == CondVT &&
21049       // Check that condition value type matches vselect operand type
21050       CondVT == VT) {
21051 
21052     assert(Cond.getValueType().isVector() &&
21053            "vector select expects a vector selector!");
21054 
21055     bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
21056     bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
21057 
21058     if (!TValIsAllOnes && !FValIsAllZeros) {
21059       // Try to invert the condition if true value is not all 1s and false
21060       // value is not all 0s.
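// --- Illustrative aside (not part of X86ISelLowering.cpp) -------------------
// A standalone bitwise model of the simplification this block is about to do
// once one arm is known to be all ones or all zeros.  With a lane-sized mask
// m that is all ones when the condition is true and all zeros otherwise:
//   m ? ~0 : 0  ==  m
//   m ? ~0 : b  ==  m | b
//   m ?  a : 0  ==  m & a
// Inverting the condition and swapping the arms covers the remaining cases.
// vselectMaskIdentities is a sketch-only name; one 16-bit lane stands in for
// the whole vector.
#include <cassert>
#include <cstdint>

static void vselectMaskIdentities() {
  const uint16_t a = 0xBEEF, b = 0x1234;
  const uint16_t masks[] = {0x0000, 0xFFFF};
  for (uint16_t m : masks) {
    assert(uint16_t(m ? 0xFFFF : 0x0000) == m);          // cond ? ~0 : 0
    assert(uint16_t(m ? 0xFFFF : b) == uint16_t(m | b)); // cond ? ~0 : b
    assert(uint16_t(m ? a : 0x0000) == uint16_t(m & a)); // cond ? a  : 0
  }
}
// -----------------------------------------------------------------------------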
21061 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode()); 21062 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode()); 21063 21064 if (TValIsAllZeros || FValIsAllOnes) { 21065 SDValue CC = Cond.getOperand(2); 21066 ISD::CondCode NewCC = 21067 ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), 21068 Cond.getOperand(0).getValueType().isInteger()); 21069 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1), NewCC); 21070 std::swap(LHS, RHS); 21071 TValIsAllOnes = FValIsAllOnes; 21072 FValIsAllZeros = TValIsAllZeros; 21073 } 21074 } 21075 21076 if (TValIsAllOnes || FValIsAllZeros) { 21077 SDValue Ret; 21078 21079 if (TValIsAllOnes && FValIsAllZeros) 21080 Ret = Cond; 21081 else if (TValIsAllOnes) 21082 Ret = DAG.getNode(ISD::OR, DL, CondVT, Cond, 21083 DAG.getNode(ISD::BITCAST, DL, CondVT, RHS)); 21084 else if (FValIsAllZeros) 21085 Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond, 21086 DAG.getNode(ISD::BITCAST, DL, CondVT, LHS)); 21087 21088 return DAG.getNode(ISD::BITCAST, DL, VT, Ret); 21089 } 21090 } 21091 21092 // Try to fold this VSELECT into a MOVSS/MOVSD 21093 if (N->getOpcode() == ISD::VSELECT && 21094 Cond.getOpcode() == ISD::BUILD_VECTOR && !DCI.isBeforeLegalize()) { 21095 if (VT == MVT::v4i32 || VT == MVT::v4f32 || 21096 (Subtarget->hasSSE2() && (VT == MVT::v2i64 || VT == MVT::v2f64))) { 21097 bool CanFold = false; 21098 unsigned NumElems = Cond.getNumOperands(); 21099 SDValue A = LHS; 21100 SDValue B = RHS; 21101 21102 if (isZero(Cond.getOperand(0))) { 21103 CanFold = true; 21104 21105 // fold (vselect <0,-1,-1,-1>, A, B) -> (movss A, B) 21106 // fold (vselect <0,-1> -> (movsd A, B) 21107 for (unsigned i = 1, e = NumElems; i != e && CanFold; ++i) 21108 CanFold = isAllOnes(Cond.getOperand(i)); 21109 } else if (isAllOnes(Cond.getOperand(0))) { 21110 CanFold = true; 21111 std::swap(A, B); 21112 21113 // fold (vselect <-1,0,0,0>, A, B) -> (movss B, A) 21114 // fold (vselect <-1,0> -> (movsd B, A) 21115 for (unsigned i = 1, e = NumElems; i != e && CanFold; ++i) 21116 CanFold = isZero(Cond.getOperand(i)); 21117 } 21118 21119 if (CanFold) { 21120 if (VT == MVT::v4i32 || VT == MVT::v4f32) 21121 return getTargetShuffleNode(X86ISD::MOVSS, DL, VT, A, B, DAG); 21122 return getTargetShuffleNode(X86ISD::MOVSD, DL, VT, A, B, DAG); 21123 } 21124 21125 if (Subtarget->hasSSE2() && (VT == MVT::v4i32 || VT == MVT::v4f32)) { 21126 // fold (v4i32: vselect <0,0,-1,-1>, A, B) -> 21127 // (v4i32 (bitcast (movsd (v2i64 (bitcast A)), 21128 // (v2i64 (bitcast B))))) 21129 // 21130 // fold (v4f32: vselect <0,0,-1,-1>, A, B) -> 21131 // (v4f32 (bitcast (movsd (v2f64 (bitcast A)), 21132 // (v2f64 (bitcast B))))) 21133 // 21134 // fold (v4i32: vselect <-1,-1,0,0>, A, B) -> 21135 // (v4i32 (bitcast (movsd (v2i64 (bitcast B)), 21136 // (v2i64 (bitcast A))))) 21137 // 21138 // fold (v4f32: vselect <-1,-1,0,0>, A, B) -> 21139 // (v4f32 (bitcast (movsd (v2f64 (bitcast B)), 21140 // (v2f64 (bitcast A))))) 21141 21142 CanFold = (isZero(Cond.getOperand(0)) && 21143 isZero(Cond.getOperand(1)) && 21144 isAllOnes(Cond.getOperand(2)) && 21145 isAllOnes(Cond.getOperand(3))); 21146 21147 if (!CanFold && isAllOnes(Cond.getOperand(0)) && 21148 isAllOnes(Cond.getOperand(1)) && 21149 isZero(Cond.getOperand(2)) && 21150 isZero(Cond.getOperand(3))) { 21151 CanFold = true; 21152 std::swap(LHS, RHS); 21153 } 21154 21155 if (CanFold) { 21156 EVT NVT = (VT == MVT::v4i32) ? 
MVT::v2i64 : MVT::v2f64; 21157 SDValue NewA = DAG.getNode(ISD::BITCAST, DL, NVT, LHS); 21158 SDValue NewB = DAG.getNode(ISD::BITCAST, DL, NVT, RHS); 21159 SDValue Select = getTargetShuffleNode(X86ISD::MOVSD, DL, NVT, NewA, 21160 NewB, DAG); 21161 return DAG.getNode(ISD::BITCAST, DL, VT, Select); 21162 } 21163 } 21164 } 21165 } 21166 21167 // If we know that this node is legal then we know that it is going to be 21168 // matched by one of the SSE/AVX BLEND instructions. These instructions only 21169 // depend on the highest bit in each word. Try to use SimplifyDemandedBits 21170 // to simplify previous instructions. 21171 if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() && 21172 !DCI.isBeforeLegalize() && 21173 // We explicitly check against v8i16 and v16i16 because, although 21174 // they're marked as Custom, they might only be legal when Cond is a 21175 // build_vector of constants. This will be taken care in a later 21176 // condition. 21177 (TLI.isOperationLegalOrCustom(ISD::VSELECT, VT) && VT != MVT::v16i16 && 21178 VT != MVT::v8i16)) { 21179 unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits(); 21180 21181 // Don't optimize vector selects that map to mask-registers. 21182 if (BitWidth == 1) 21183 return SDValue(); 21184 21185 // Check all uses of that condition operand to check whether it will be 21186 // consumed by non-BLEND instructions, which may depend on all bits are set 21187 // properly. 21188 for (SDNode::use_iterator I = Cond->use_begin(), 21189 E = Cond->use_end(); I != E; ++I) 21190 if (I->getOpcode() != ISD::VSELECT) 21191 // TODO: Add other opcodes eventually lowered into BLEND. 21192 return SDValue(); 21193 21194 assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size"); 21195 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1); 21196 21197 APInt KnownZero, KnownOne; 21198 TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(), 21199 DCI.isBeforeLegalizeOps()); 21200 if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) || 21201 TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, TLO)) 21202 DCI.CommitTargetLoweringOpt(TLO); 21203 } 21204 21205 // We should generate an X86ISD::BLENDI from a vselect if its argument 21206 // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of 21207 // constants. This specific pattern gets generated when we split a 21208 // selector for a 512 bit vector in a machine without AVX512 (but with 21209 // 256-bit vectors), during legalization: 21210 // 21211 // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS) 21212 // 21213 // Iff we find this pattern and the build_vectors are built from 21214 // constants, we translate the vselect into a shuffle_vector that we 21215 // know will be matched by LowerVECTOR_SHUFFLEtoBlend. 21216 if (N->getOpcode() == ISD::VSELECT && !DCI.isBeforeLegalize()) { 21217 SDValue Shuffle = TransformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget); 21218 if (Shuffle.getNode()) 21219 return Shuffle; 21220 } 21221 21222 return SDValue(); 21223 } 21224 21225 // Check whether a boolean test is testing a boolean value generated by 21226 // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition 21227 // code. 
21228 // 21229 // Simplify the following patterns: 21230 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or 21231 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ) 21232 // to (Op EFLAGS Cond) 21233 // 21234 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or 21235 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ) 21236 // to (Op EFLAGS !Cond) 21237 // 21238 // where Op could be BRCOND or CMOV. 21239 // 21240 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { 21241 // Quit if not CMP and SUB with its value result used. 21242 if (Cmp.getOpcode() != X86ISD::CMP && 21243 (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0))) 21244 return SDValue(); 21245 21246 // Quit if not used as a boolean value. 21247 if (CC != X86::COND_E && CC != X86::COND_NE) 21248 return SDValue(); 21249 21250 // Check CMP operands. One of them should be 0 or 1 and the other should be 21251 // an SetCC or extended from it. 21252 SDValue Op1 = Cmp.getOperand(0); 21253 SDValue Op2 = Cmp.getOperand(1); 21254 21255 SDValue SetCC; 21256 const ConstantSDNode* C = nullptr; 21257 bool needOppositeCond = (CC == X86::COND_E); 21258 bool checkAgainstTrue = false; // Is it a comparison against 1? 21259 21260 if ((C = dyn_cast<ConstantSDNode>(Op1))) 21261 SetCC = Op2; 21262 else if ((C = dyn_cast<ConstantSDNode>(Op2))) 21263 SetCC = Op1; 21264 else // Quit if all operands are not constants. 21265 return SDValue(); 21266 21267 if (C->getZExtValue() == 1) { 21268 needOppositeCond = !needOppositeCond; 21269 checkAgainstTrue = true; 21270 } else if (C->getZExtValue() != 0) 21271 // Quit if the constant is neither 0 or 1. 21272 return SDValue(); 21273 21274 bool truncatedToBoolWithAnd = false; 21275 // Skip (zext $x), (trunc $x), or (and $x, 1) node. 21276 while (SetCC.getOpcode() == ISD::ZERO_EXTEND || 21277 SetCC.getOpcode() == ISD::TRUNCATE || 21278 SetCC.getOpcode() == ISD::AND) { 21279 if (SetCC.getOpcode() == ISD::AND) { 21280 int OpIdx = -1; 21281 ConstantSDNode *CS; 21282 if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(0))) && 21283 CS->getZExtValue() == 1) 21284 OpIdx = 1; 21285 if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(1))) && 21286 CS->getZExtValue() == 1) 21287 OpIdx = 0; 21288 if (OpIdx == -1) 21289 break; 21290 SetCC = SetCC.getOperand(OpIdx); 21291 truncatedToBoolWithAnd = true; 21292 } else 21293 SetCC = SetCC.getOperand(0); 21294 } 21295 21296 switch (SetCC.getOpcode()) { 21297 case X86ISD::SETCC_CARRY: 21298 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to 21299 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1, 21300 // i.e. it's a comparison against true but the result of SETCC_CARRY is not 21301 // truncated to i1 using 'and'. 21302 if (checkAgainstTrue && !truncatedToBoolWithAnd) 21303 break; 21304 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B && 21305 "Invalid use of SETCC_CARRY!"); 21306 // FALL THROUGH 21307 case X86ISD::SETCC: 21308 // Set the condition code or opposite one if necessary. 21309 CC = X86::CondCode(SetCC.getConstantOperandVal(0)); 21310 if (needOppositeCond) 21311 CC = X86::GetOppositeBranchCondition(CC); 21312 return SetCC.getOperand(1); 21313 case X86ISD::CMOV: { 21314 // Check whether false/true value has canonical one, i.e. 0 or 1. 21315 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0)); 21316 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1)); 21317 // Quit if true value is not a constant. 
21318 if (!TVal) 21319 return SDValue(); 21320 // Quit if false value is not a constant. 21321 if (!FVal) { 21322 SDValue Op = SetCC.getOperand(0); 21323 // Skip 'zext' or 'trunc' node. 21324 if (Op.getOpcode() == ISD::ZERO_EXTEND || 21325 Op.getOpcode() == ISD::TRUNCATE) 21326 Op = Op.getOperand(0); 21327 // A special case for rdrand/rdseed, where 0 is set if false cond is 21328 // found. 21329 if ((Op.getOpcode() != X86ISD::RDRAND && 21330 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0) 21331 return SDValue(); 21332 } 21333 // Quit if false value is not the constant 0 or 1. 21334 bool FValIsFalse = true; 21335 if (FVal && FVal->getZExtValue() != 0) { 21336 if (FVal->getZExtValue() != 1) 21337 return SDValue(); 21338 // If FVal is 1, opposite cond is needed. 21339 needOppositeCond = !needOppositeCond; 21340 FValIsFalse = false; 21341 } 21342 // Quit if TVal is not the constant opposite of FVal. 21343 if (FValIsFalse && TVal->getZExtValue() != 1) 21344 return SDValue(); 21345 if (!FValIsFalse && TVal->getZExtValue() != 0) 21346 return SDValue(); 21347 CC = X86::CondCode(SetCC.getConstantOperandVal(2)); 21348 if (needOppositeCond) 21349 CC = X86::GetOppositeBranchCondition(CC); 21350 return SetCC.getOperand(3); 21351 } 21352 } 21353 21354 return SDValue(); 21355 } 21356 21357 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] 21358 static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, 21359 TargetLowering::DAGCombinerInfo &DCI, 21360 const X86Subtarget *Subtarget) { 21361 SDLoc DL(N); 21362 21363 // If the flag operand isn't dead, don't touch this CMOV. 21364 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty()) 21365 return SDValue(); 21366 21367 SDValue FalseOp = N->getOperand(0); 21368 SDValue TrueOp = N->getOperand(1); 21369 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); 21370 SDValue Cond = N->getOperand(3); 21371 21372 if (CC == X86::COND_E || CC == X86::COND_NE) { 21373 switch (Cond.getOpcode()) { 21374 default: break; 21375 case X86ISD::BSR: 21376 case X86ISD::BSF: 21377 // If operand of BSR / BSF are proven never zero, then ZF cannot be set. 21378 if (DAG.isKnownNeverZero(Cond.getOperand(0))) 21379 return (CC == X86::COND_E) ? FalseOp : TrueOp; 21380 } 21381 } 21382 21383 SDValue Flags; 21384 21385 Flags = checkBoolTestSetCCCombine(Cond, CC); 21386 if (Flags.getNode() && 21387 // Extra check as FCMOV only supports a subset of X86 cond. 21388 (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) { 21389 SDValue Ops[] = { FalseOp, TrueOp, 21390 DAG.getConstant(CC, MVT::i8), Flags }; 21391 return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops); 21392 } 21393 21394 // If this is a select between two integer constants, try to do some 21395 // optimizations. Note that the operands are ordered the opposite of SELECT 21396 // operands. 21397 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) { 21398 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) { 21399 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is 21400 // larger than FalseC (the false value). 21401 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { 21402 CC = X86::GetOppositeBranchCondition(CC); 21403 std::swap(TrueC, FalseC); 21404 std::swap(TrueOp, FalseOp); 21405 } 21406 21407 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0. 21408 // This is efficient for any integer data type (including i8/i16) and 21409 // shift amount. 
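// --- Illustrative aside (not part of X86ISelLowering.cpp) -------------------
// A standalone scalar model of the constant-CMOV rewrites carried out below,
// with the condition already materialized as cc in {0, 1} (what SETcc plus a
// zero extension produce):
//   cond ? 8     : 0      ->  cc << 3
//   cond ? cst+1 : cst    ->  cc + cst
//   cond ? TrueC : FalseC ->  cc * (TrueC - FalseC) + FalseC  (LEA-friendly
//                                                             differences)
// cmovConstantRewrites is a sketch-only name.
#include <cassert>
#include <cstdint>

static void cmovConstantRewrites(bool cond) {
  const uint32_t cc = cond ? 1u : 0u;     // zext(setcc)
  assert((cond ? 8u : 0u) == (cc << 3));  // pow2 / 0 becomes a shift
  const uint32_t cst = 41;
  assert((cond ? cst + 1 : cst) == cc + cst);
  const uint32_t TrueC = 45, FalseC = 40; // diff 5: lea base(cond, cond*4)
  assert((cond ? TrueC : FalseC) == cc * (TrueC - FalseC) + FalseC);
}
// -----------------------------------------------------------------------------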
21410 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { 21411 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 21412 DAG.getConstant(CC, MVT::i8), Cond); 21413 21414 // Zero extend the condition if needed. 21415 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond); 21416 21417 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 21418 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond, 21419 DAG.getConstant(ShAmt, MVT::i8)); 21420 if (N->getNumValues() == 2) // Dead flag value? 21421 return DCI.CombineTo(N, Cond, SDValue()); 21422 return Cond; 21423 } 21424 21425 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient 21426 // for any integer data type, including i8/i16. 21427 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 21428 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 21429 DAG.getConstant(CC, MVT::i8), Cond); 21430 21431 // Zero extend the condition if needed. 21432 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 21433 FalseC->getValueType(0), Cond); 21434 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 21435 SDValue(FalseC, 0)); 21436 21437 if (N->getNumValues() == 2) // Dead flag value? 21438 return DCI.CombineTo(N, Cond, SDValue()); 21439 return Cond; 21440 } 21441 21442 // Optimize cases that will turn into an LEA instruction. This requires 21443 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 21444 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 21445 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 21446 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 21447 21448 bool isFastMultiplier = false; 21449 if (Diff < 10) { 21450 switch ((unsigned char)Diff) { 21451 default: break; 21452 case 1: // result = add base, cond 21453 case 2: // result = lea base( , cond*2) 21454 case 3: // result = lea base(cond, cond*2) 21455 case 4: // result = lea base( , cond*4) 21456 case 5: // result = lea base(cond, cond*4) 21457 case 8: // result = lea base( , cond*8) 21458 case 9: // result = lea base(cond, cond*8) 21459 isFastMultiplier = true; 21460 break; 21461 } 21462 } 21463 21464 if (isFastMultiplier) { 21465 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 21466 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 21467 DAG.getConstant(CC, MVT::i8), Cond); 21468 // Zero extend the condition if needed. 21469 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 21470 Cond); 21471 // Scale the condition by the difference. 21472 if (Diff != 1) 21473 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 21474 DAG.getConstant(Diff, Cond.getValueType())); 21475 21476 // Add the base if non-zero. 21477 if (FalseC->getAPIntValue() != 0) 21478 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 21479 SDValue(FalseC, 0)); 21480 if (N->getNumValues() == 2) // Dead flag value? 21481 return DCI.CombineTo(N, Cond, SDValue()); 21482 return Cond; 21483 } 21484 } 21485 } 21486 } 21487 21488 // Handle these cases: 21489 // (select (x != c), e, c) -> select (x != c), e, x), 21490 // (select (x == c), c, e) -> select (x == c), x, e) 21491 // where the c is an integer constant, and the "select" is the combination 21492 // of CMOV and CMP. 21493 // 21494 // The rationale for this change is that the conditional-move from a constant 21495 // needs two instructions, however, conditional-move from a register needs 21496 // only one instruction. 
21497 // 21498 // CAVEAT: By replacing a constant with a symbolic value, it may obscure 21499 // some instruction-combining opportunities. This opt needs to be 21500 // postponed as late as possible. 21501 // 21502 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) { 21503 // the DCI.xxxx conditions are provided to postpone the optimization as 21504 // late as possible. 21505 21506 ConstantSDNode *CmpAgainst = nullptr; 21507 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) && 21508 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) && 21509 !isa<ConstantSDNode>(Cond.getOperand(0))) { 21510 21511 if (CC == X86::COND_NE && 21512 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) { 21513 CC = X86::GetOppositeBranchCondition(CC); 21514 std::swap(TrueOp, FalseOp); 21515 } 21516 21517 if (CC == X86::COND_E && 21518 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) { 21519 SDValue Ops[] = { FalseOp, Cond.getOperand(0), 21520 DAG.getConstant(CC, MVT::i8), Cond }; 21521 return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops); 21522 } 21523 } 21524 } 21525 21526 return SDValue(); 21527 } 21528 21529 static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG, 21530 const X86Subtarget *Subtarget) { 21531 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); 21532 switch (IntNo) { 21533 default: return SDValue(); 21534 // SSE/AVX/AVX2 blend intrinsics. 21535 case Intrinsic::x86_avx2_pblendvb: 21536 case Intrinsic::x86_avx2_pblendw: 21537 case Intrinsic::x86_avx2_pblendd_128: 21538 case Intrinsic::x86_avx2_pblendd_256: 21539 // Don't try to simplify this intrinsic if we don't have AVX2. 21540 if (!Subtarget->hasAVX2()) 21541 return SDValue(); 21542 // FALL-THROUGH 21543 case Intrinsic::x86_avx_blend_pd_256: 21544 case Intrinsic::x86_avx_blend_ps_256: 21545 case Intrinsic::x86_avx_blendv_pd_256: 21546 case Intrinsic::x86_avx_blendv_ps_256: 21547 // Don't try to simplify this intrinsic if we don't have AVX. 21548 if (!Subtarget->hasAVX()) 21549 return SDValue(); 21550 // FALL-THROUGH 21551 case Intrinsic::x86_sse41_pblendw: 21552 case Intrinsic::x86_sse41_blendpd: 21553 case Intrinsic::x86_sse41_blendps: 21554 case Intrinsic::x86_sse41_blendvps: 21555 case Intrinsic::x86_sse41_blendvpd: 21556 case Intrinsic::x86_sse41_pblendvb: { 21557 SDValue Op0 = N->getOperand(1); 21558 SDValue Op1 = N->getOperand(2); 21559 SDValue Mask = N->getOperand(3); 21560 21561 // Don't try to simplify this intrinsic if we don't have SSE4.1. 21562 if (!Subtarget->hasSSE41()) 21563 return SDValue(); 21564 21565 // fold (blend A, A, Mask) -> A 21566 if (Op0 == Op1) 21567 return Op0; 21568 // fold (blend A, B, allZeros) -> A 21569 if (ISD::isBuildVectorAllZeros(Mask.getNode())) 21570 return Op0; 21571 // fold (blend A, B, allOnes) -> B 21572 if (ISD::isBuildVectorAllOnes(Mask.getNode())) 21573 return Op1; 21574 21575 // Simplify the case where the mask is a constant i32 value. 21576 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Mask)) { 21577 if (C->isNullValue()) 21578 return Op0; 21579 if (C->isAllOnesValue()) 21580 return Op1; 21581 } 21582 21583 return SDValue(); 21584 } 21585 21586 // Packed SSE2/AVX2 arithmetic shift immediate intrinsics. 
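// --- Illustrative aside (not part of X86ISelLowering.cpp) -------------------
// A standalone per-lane model of the arithmetic-shift-by-immediate intrinsics
// handled next, and of why the code only rewrites them into a generic
// ISD::SRA when the constant count is smaller than the lane width: the
// instruction defines an oversized count as "fill the lane with the sign
// bit", whereas a generic shift by >= the bit width is undefined, so that
// case is deliberately left alone.  psrawLaneModel is a sketch-only name and
// assumes the usual arithmetic behaviour of >> on negative integers.
#include <cassert>
#include <cstdint>

static int16_t psrawLaneModel(int16_t v, unsigned count) {
  if (count >= 16)                      // oversized count: sign-fill
    return v < 0 ? int16_t(-1) : int16_t(0);
  return int16_t(v >> count);           // in-range count: plain ISD::SRA
}

static void psrawLaneExamples() {
  assert(psrawLaneModel(int16_t(-32768), 15) == -1); // in-range count
  assert(psrawLaneModel(int16_t(-4), 64) == -1);     // oversized: sign-fill
  assert(psrawLaneModel(int16_t(4), 64) == 0);
  assert(psrawLaneModel(int16_t(100), 0) == 100);    // count 0 folds to Op0
}
// -----------------------------------------------------------------------------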
21587 case Intrinsic::x86_sse2_psrai_w: 21588 case Intrinsic::x86_sse2_psrai_d: 21589 case Intrinsic::x86_avx2_psrai_w: 21590 case Intrinsic::x86_avx2_psrai_d: 21591 case Intrinsic::x86_sse2_psra_w: 21592 case Intrinsic::x86_sse2_psra_d: 21593 case Intrinsic::x86_avx2_psra_w: 21594 case Intrinsic::x86_avx2_psra_d: { 21595 SDValue Op0 = N->getOperand(1); 21596 SDValue Op1 = N->getOperand(2); 21597 EVT VT = Op0.getValueType(); 21598 assert(VT.isVector() && "Expected a vector type!"); 21599 21600 if (isa<BuildVectorSDNode>(Op1)) 21601 Op1 = Op1.getOperand(0); 21602 21603 if (!isa<ConstantSDNode>(Op1)) 21604 return SDValue(); 21605 21606 EVT SVT = VT.getVectorElementType(); 21607 unsigned SVTBits = SVT.getSizeInBits(); 21608 21609 ConstantSDNode *CND = cast<ConstantSDNode>(Op1); 21610 const APInt &C = APInt(SVTBits, CND->getAPIntValue().getZExtValue()); 21611 uint64_t ShAmt = C.getZExtValue(); 21612 21613 // Don't try to convert this shift into a ISD::SRA if the shift 21614 // count is bigger than or equal to the element size. 21615 if (ShAmt >= SVTBits) 21616 return SDValue(); 21617 21618 // Trivial case: if the shift count is zero, then fold this 21619 // into the first operand. 21620 if (ShAmt == 0) 21621 return Op0; 21622 21623 // Replace this packed shift intrinsic with a target independent 21624 // shift dag node. 21625 SDValue Splat = DAG.getConstant(C, VT); 21626 return DAG.getNode(ISD::SRA, SDLoc(N), VT, Op0, Splat); 21627 } 21628 } 21629 } 21630 21631 /// PerformMulCombine - Optimize a single multiply with constant into two 21632 /// in order to implement it with two cheaper instructions, e.g. 21633 /// LEA + SHL, LEA + LEA. 21634 static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, 21635 TargetLowering::DAGCombinerInfo &DCI) { 21636 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 21637 return SDValue(); 21638 21639 EVT VT = N->getValueType(0); 21640 if (VT != MVT::i64) 21641 return SDValue(); 21642 21643 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 21644 if (!C) 21645 return SDValue(); 21646 uint64_t MulAmt = C->getZExtValue(); 21647 if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9) 21648 return SDValue(); 21649 21650 uint64_t MulAmt1 = 0; 21651 uint64_t MulAmt2 = 0; 21652 if ((MulAmt % 9) == 0) { 21653 MulAmt1 = 9; 21654 MulAmt2 = MulAmt / 9; 21655 } else if ((MulAmt % 5) == 0) { 21656 MulAmt1 = 5; 21657 MulAmt2 = MulAmt / 5; 21658 } else if ((MulAmt % 3) == 0) { 21659 MulAmt1 = 3; 21660 MulAmt2 = MulAmt / 3; 21661 } 21662 if (MulAmt2 && 21663 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){ 21664 SDLoc DL(N); 21665 21666 if (isPowerOf2_64(MulAmt2) && 21667 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD)) 21668 // If second multiplifer is pow2, issue it first. We want the multiply by 21669 // 3, 5, or 9 to be folded into the addressing mode unless the lone use 21670 // is an add. 
21671 std::swap(MulAmt1, MulAmt2); 21672 21673 SDValue NewMul; 21674 if (isPowerOf2_64(MulAmt1)) 21675 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), 21676 DAG.getConstant(Log2_64(MulAmt1), MVT::i8)); 21677 else 21678 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), 21679 DAG.getConstant(MulAmt1, VT)); 21680 21681 if (isPowerOf2_64(MulAmt2)) 21682 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul, 21683 DAG.getConstant(Log2_64(MulAmt2), MVT::i8)); 21684 else 21685 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul, 21686 DAG.getConstant(MulAmt2, VT)); 21687 21688 // Do not add new nodes to DAG combiner worklist. 21689 DCI.CombineTo(N, NewMul, false); 21690 } 21691 return SDValue(); 21692 } 21693 21694 static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { 21695 SDValue N0 = N->getOperand(0); 21696 SDValue N1 = N->getOperand(1); 21697 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); 21698 EVT VT = N0.getValueType(); 21699 21700 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2)) 21701 // since the result of setcc_c is all zero's or all ones. 21702 if (VT.isInteger() && !VT.isVector() && 21703 N1C && N0.getOpcode() == ISD::AND && 21704 N0.getOperand(1).getOpcode() == ISD::Constant) { 21705 SDValue N00 = N0.getOperand(0); 21706 if (N00.getOpcode() == X86ISD::SETCC_CARRY || 21707 ((N00.getOpcode() == ISD::ANY_EXTEND || 21708 N00.getOpcode() == ISD::ZERO_EXTEND) && 21709 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) { 21710 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); 21711 APInt ShAmt = N1C->getAPIntValue(); 21712 Mask = Mask.shl(ShAmt); 21713 if (Mask != 0) 21714 return DAG.getNode(ISD::AND, SDLoc(N), VT, 21715 N00, DAG.getConstant(Mask, VT)); 21716 } 21717 } 21718 21719 // Hardware support for vector shifts is sparse which makes us scalarize the 21720 // vector operations in many cases. Also, on sandybridge ADD is faster than 21721 // shl. 21722 // (shl V, 1) -> add V,V 21723 if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1)) 21724 if (auto *N1SplatC = N1BV->getConstantSplatNode()) { 21725 assert(N0.getValueType().isVector() && "Invalid vector shift type"); 21726 // We shift all of the values by one. In many cases we do not have 21727 // hardware support for this operation. This is better expressed as an ADD 21728 // of two values. 21729 if (N1SplatC->getZExtValue() == 1) 21730 return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0); 21731 } 21732 21733 return SDValue(); 21734 } 21735 21736 /// \brief Returns a vector of 0s if the node in input is a vector logical 21737 /// shift by a constant amount which is known to be bigger than or equal 21738 /// to the vector element size in bits. 21739 static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG, 21740 const X86Subtarget *Subtarget) { 21741 EVT VT = N->getValueType(0); 21742 21743 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 && 21744 (!Subtarget->hasInt256() || 21745 (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16))) 21746 return SDValue(); 21747 21748 SDValue Amt = N->getOperand(1); 21749 SDLoc DL(N); 21750 if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt)) 21751 if (auto *AmtSplat = AmtBV->getConstantSplatNode()) { 21752 APInt ShiftAmt = AmtSplat->getAPIntValue(); 21753 unsigned MaxAmount = VT.getVectorElementType().getSizeInBits(); 21754 21755 // SSE2/AVX2 logical shifts always return a vector of 0s 21756 // if the shift amount is bigger than or equal to 21757 // the element size. 
The constant shift amount will be 21758 // encoded as a 8-bit immediate. 21759 if (ShiftAmt.trunc(8).uge(MaxAmount)) 21760 return getZeroVector(VT, Subtarget, DAG, DL); 21761 } 21762 21763 return SDValue(); 21764 } 21765 21766 /// PerformShiftCombine - Combine shifts. 21767 static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, 21768 TargetLowering::DAGCombinerInfo &DCI, 21769 const X86Subtarget *Subtarget) { 21770 if (N->getOpcode() == ISD::SHL) { 21771 SDValue V = PerformSHLCombine(N, DAG); 21772 if (V.getNode()) return V; 21773 } 21774 21775 if (N->getOpcode() != ISD::SRA) { 21776 // Try to fold this logical shift into a zero vector. 21777 SDValue V = performShiftToAllZeros(N, DAG, Subtarget); 21778 if (V.getNode()) return V; 21779 } 21780 21781 return SDValue(); 21782 } 21783 21784 // CMPEQCombine - Recognize the distinctive (AND (setcc ...) (setcc ..)) 21785 // where both setccs reference the same FP CMP, and rewrite for CMPEQSS 21786 // and friends. Likewise for OR -> CMPNEQSS. 21787 static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG, 21788 TargetLowering::DAGCombinerInfo &DCI, 21789 const X86Subtarget *Subtarget) { 21790 unsigned opcode; 21791 21792 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but 21793 // we're requiring SSE2 for both. 21794 if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) { 21795 SDValue N0 = N->getOperand(0); 21796 SDValue N1 = N->getOperand(1); 21797 SDValue CMP0 = N0->getOperand(1); 21798 SDValue CMP1 = N1->getOperand(1); 21799 SDLoc DL(N); 21800 21801 // The SETCCs should both refer to the same CMP. 21802 if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1) 21803 return SDValue(); 21804 21805 SDValue CMP00 = CMP0->getOperand(0); 21806 SDValue CMP01 = CMP0->getOperand(1); 21807 EVT VT = CMP00.getValueType(); 21808 21809 if (VT == MVT::f32 || VT == MVT::f64) { 21810 bool ExpectingFlags = false; 21811 // Check for any users that want flags: 21812 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); 21813 !ExpectingFlags && UI != UE; ++UI) 21814 switch (UI->getOpcode()) { 21815 default: 21816 case ISD::BR_CC: 21817 case ISD::BRCOND: 21818 case ISD::SELECT: 21819 ExpectingFlags = true; 21820 break; 21821 case ISD::CopyToReg: 21822 case ISD::SIGN_EXTEND: 21823 case ISD::ZERO_EXTEND: 21824 case ISD::ANY_EXTEND: 21825 break; 21826 } 21827 21828 if (!ExpectingFlags) { 21829 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0); 21830 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0); 21831 21832 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) { 21833 X86::CondCode tmp = cc0; 21834 cc0 = cc1; 21835 cc1 = tmp; 21836 } 21837 21838 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) || 21839 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) { 21840 // FIXME: need symbolic constants for these magic numbers. 21841 // See X86ATTInstPrinter.cpp:printSSECC(). 21842 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4; 21843 if (Subtarget->hasAVX512()) { 21844 SDValue FSetCC = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CMP00, 21845 CMP01, DAG.getConstant(x86cc, MVT::i8)); 21846 if (N->getValueType(0) != MVT::i1) 21847 return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0), 21848 FSetCC); 21849 return FSetCC; 21850 } 21851 SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL, 21852 CMP00.getValueType(), CMP00, CMP01, 21853 DAG.getConstant(x86cc, MVT::i8)); 21854 21855 bool is64BitFP = (CMP00.getValueType() == MVT::f64); 21856 MVT IntVT = is64BitFP ? 
MVT::i64 : MVT::i32;
21857
21858 if (is64BitFP && !Subtarget->is64Bit()) {
21859 // On a 32-bit target, we cannot bitcast the 64-bit float to a
21860 // 64-bit integer, since that's not a legal type. Since
21861 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
21862 // bits, but can do this little dance to extract the lowest 32 bits
21863 // and work with those going forward.
21864 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
21865 OnesOrZeroesF);
21866 SDValue Vector32 = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32,
21867 Vector64);
21868 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
21869 Vector32, DAG.getIntPtrConstant(0));
21870 IntVT = MVT::i32;
21871 }
21872
21873 SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, IntVT, OnesOrZeroesF);
21874 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
21875 DAG.getConstant(1, IntVT));
21876 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed);
21877 return OneBitOfTruth;
21878 }
21879 }
21880 }
21881 }
21882 return SDValue();
21883 }
21884
21885 /// CanFoldXORWithAllOnes - Test whether the XOR operand is an AllOnes vector
21886 /// so it can be folded inside ANDNP.
21887 static bool CanFoldXORWithAllOnes(const SDNode *N) {
21888 EVT VT = N->getValueType(0);
21889
21890 // Match direct AllOnes for 128 and 256-bit vectors
21891 if (ISD::isBuildVectorAllOnes(N))
21892 return true;
21893
21894 // Look through a bit convert.
21895 if (N->getOpcode() == ISD::BITCAST)
21896 N = N->getOperand(0).getNode();
21897
21898 // Sometimes the operand may come from an insert_subvector building a 256-bit
21899 // allones vector.
21900 if (VT.is256BitVector() &&
21901 N->getOpcode() == ISD::INSERT_SUBVECTOR) {
21902 SDValue V1 = N->getOperand(0);
21903 SDValue V2 = N->getOperand(1);
21904
21905 if (V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
21906 V1.getOperand(0).getOpcode() == ISD::UNDEF &&
21907 ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
21908 ISD::isBuildVectorAllOnes(V2.getNode()))
21909 return true;
21910 }
21911
21912 return false;
21913 }
21914
21915 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
21916 // register. In most cases we actually compare or select YMM-sized registers
21917 // and mixing the two types creates horrible code. This method optimizes
21918 // some of the transition sequences.
21919 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
21920 TargetLowering::DAGCombinerInfo &DCI,
21921 const X86Subtarget *Subtarget) {
21922 EVT VT = N->getValueType(0);
21923 if (!VT.is256BitVector())
21924 return SDValue();
21925
21926 assert((N->getOpcode() == ISD::ANY_EXTEND ||
21927 N->getOpcode() == ISD::ZERO_EXTEND ||
21928 N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
21929
21930 SDValue Narrow = N->getOperand(0);
21931 EVT NarrowVT = Narrow->getValueType(0);
21932 if (!NarrowVT.is128BitVector())
21933 return SDValue();
21934
21935 if (Narrow->getOpcode() != ISD::XOR &&
21936 Narrow->getOpcode() != ISD::AND &&
21937 Narrow->getOpcode() != ISD::OR)
21938 return SDValue();
21939
21940 SDValue N0 = Narrow->getOperand(0);
21941 SDValue N1 = Narrow->getOperand(1);
21942 SDLoc DL(Narrow);
21943
21944 // The left side has to be a trunc.
21945 if (N0.getOpcode() != ISD::TRUNCATE)
21946 return SDValue();
21947
21948 // The type of the truncated inputs.
21949 EVT WideVT = N0->getOperand(0)->getValueType(0); 21950 if (WideVT != VT) 21951 return SDValue(); 21952 21953 // The right side has to be a 'trunc' or a constant vector. 21954 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE; 21955 ConstantSDNode *RHSConstSplat = nullptr; 21956 if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1)) 21957 RHSConstSplat = RHSBV->getConstantSplatNode(); 21958 if (!RHSTrunc && !RHSConstSplat) 21959 return SDValue(); 21960 21961 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 21962 21963 if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT)) 21964 return SDValue(); 21965 21966 // Set N0 and N1 to hold the inputs to the new wide operation. 21967 N0 = N0->getOperand(0); 21968 if (RHSConstSplat) { 21969 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(), 21970 SDValue(RHSConstSplat, 0)); 21971 SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1); 21972 N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C); 21973 } else if (RHSTrunc) { 21974 N1 = N1->getOperand(0); 21975 } 21976 21977 // Generate the wide operation. 21978 SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1); 21979 unsigned Opcode = N->getOpcode(); 21980 switch (Opcode) { 21981 case ISD::ANY_EXTEND: 21982 return Op; 21983 case ISD::ZERO_EXTEND: { 21984 unsigned InBits = NarrowVT.getScalarType().getSizeInBits(); 21985 APInt Mask = APInt::getAllOnesValue(InBits); 21986 Mask = Mask.zext(VT.getScalarType().getSizeInBits()); 21987 return DAG.getNode(ISD::AND, DL, VT, 21988 Op, DAG.getConstant(Mask, VT)); 21989 } 21990 case ISD::SIGN_EXTEND: 21991 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, 21992 Op, DAG.getValueType(NarrowVT)); 21993 default: 21994 llvm_unreachable("Unexpected opcode"); 21995 } 21996 } 21997 21998 static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, 21999 TargetLowering::DAGCombinerInfo &DCI, 22000 const X86Subtarget *Subtarget) { 22001 EVT VT = N->getValueType(0); 22002 if (DCI.isBeforeLegalizeOps()) 22003 return SDValue(); 22004 22005 SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget); 22006 if (R.getNode()) 22007 return R; 22008 22009 // Create BEXTR instructions 22010 // BEXTR is ((X >> imm) & (2**size-1)) 22011 if (VT == MVT::i32 || VT == MVT::i64) { 22012 SDValue N0 = N->getOperand(0); 22013 SDValue N1 = N->getOperand(1); 22014 SDLoc DL(N); 22015 22016 // Check for BEXTR. 22017 if ((Subtarget->hasBMI() || Subtarget->hasTBM()) && 22018 (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) { 22019 ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1); 22020 ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 22021 if (MaskNode && ShiftNode) { 22022 uint64_t Mask = MaskNode->getZExtValue(); 22023 uint64_t Shift = ShiftNode->getZExtValue(); 22024 if (isMask_64(Mask)) { 22025 uint64_t MaskSize = CountPopulation_64(Mask); 22026 if (Shift + MaskSize <= VT.getSizeInBits()) 22027 return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0), 22028 DAG.getConstant(Shift | (MaskSize << 8), VT)); 22029 } 22030 } 22031 } // BEXTR 22032 22033 return SDValue(); 22034 } 22035 22036 // Want to form ANDNP nodes: 22037 // 1) In the hopes of then easily combining them with OR and AND nodes 22038 // to form PBLEND/PSIGN. 
22039 // 2) To match ANDN packed intrinsics 22040 if (VT != MVT::v2i64 && VT != MVT::v4i64) 22041 return SDValue(); 22042 22043 SDValue N0 = N->getOperand(0); 22044 SDValue N1 = N->getOperand(1); 22045 SDLoc DL(N); 22046 22047 // Check LHS for vnot 22048 if (N0.getOpcode() == ISD::XOR && 22049 //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode())) 22050 CanFoldXORWithAllOnes(N0.getOperand(1).getNode())) 22051 return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1); 22052 22053 // Check RHS for vnot 22054 if (N1.getOpcode() == ISD::XOR && 22055 //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode())) 22056 CanFoldXORWithAllOnes(N1.getOperand(1).getNode())) 22057 return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0); 22058 22059 return SDValue(); 22060 } 22061 22062 static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, 22063 TargetLowering::DAGCombinerInfo &DCI, 22064 const X86Subtarget *Subtarget) { 22065 if (DCI.isBeforeLegalizeOps()) 22066 return SDValue(); 22067 22068 SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget); 22069 if (R.getNode()) 22070 return R; 22071 22072 SDValue N0 = N->getOperand(0); 22073 SDValue N1 = N->getOperand(1); 22074 EVT VT = N->getValueType(0); 22075 22076 // look for psign/blend 22077 if (VT == MVT::v2i64 || VT == MVT::v4i64) { 22078 if (!Subtarget->hasSSSE3() || 22079 (VT == MVT::v4i64 && !Subtarget->hasInt256())) 22080 return SDValue(); 22081 22082 // Canonicalize pandn to RHS 22083 if (N0.getOpcode() == X86ISD::ANDNP) 22084 std::swap(N0, N1); 22085 // or (and (m, y), (pandn m, x)) 22086 if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) { 22087 SDValue Mask = N1.getOperand(0); 22088 SDValue X = N1.getOperand(1); 22089 SDValue Y; 22090 if (N0.getOperand(0) == Mask) 22091 Y = N0.getOperand(1); 22092 if (N0.getOperand(1) == Mask) 22093 Y = N0.getOperand(0); 22094 22095 // Check to see if the mask appeared in both the AND and ANDNP and 22096 if (!Y.getNode()) 22097 return SDValue(); 22098 22099 // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them. 22100 // Look through mask bitcast. 22101 if (Mask.getOpcode() == ISD::BITCAST) 22102 Mask = Mask.getOperand(0); 22103 if (X.getOpcode() == ISD::BITCAST) 22104 X = X.getOperand(0); 22105 if (Y.getOpcode() == ISD::BITCAST) 22106 Y = Y.getOperand(0); 22107 22108 EVT MaskVT = Mask.getValueType(); 22109 22110 // Validate that the Mask operand is a vector sra node. 22111 // FIXME: what to do for bytes, since there is a psignb/pblendvb, but 22112 // there is no psrai.b 22113 unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits(); 22114 unsigned SraAmt = ~0; 22115 if (Mask.getOpcode() == ISD::SRA) { 22116 if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1))) 22117 if (auto *AmtConst = AmtBV->getConstantSplatNode()) 22118 SraAmt = AmtConst->getZExtValue(); 22119 } else if (Mask.getOpcode() == X86ISD::VSRAI) { 22120 SDValue SraC = Mask.getOperand(1); 22121 SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue(); 22122 } 22123 if ((SraAmt + 1) != EltBits) 22124 return SDValue(); 22125 22126 SDLoc DL(N); 22127 22128 // Now we know we at least have a plendvb with the mask val. See if 22129 // we can form a psignb/w/d. 
22130 // psign = x.type == y.type == mask.type && y = sub(0, x); 22131 if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X && 22132 ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) && 22133 X.getValueType() == MaskVT && Y.getValueType() == MaskVT) { 22134 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) && 22135 "Unsupported VT for PSIGN"); 22136 Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0)); 22137 return DAG.getNode(ISD::BITCAST, DL, VT, Mask); 22138 } 22139 // PBLENDVB only available on SSE 4.1 22140 if (!Subtarget->hasSSE41()) 22141 return SDValue(); 22142 22143 EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8; 22144 22145 X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X); 22146 Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y); 22147 Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask); 22148 Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X); 22149 return DAG.getNode(ISD::BITCAST, DL, VT, Mask); 22150 } 22151 } 22152 22153 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64) 22154 return SDValue(); 22155 22156 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) 22157 MachineFunction &MF = DAG.getMachineFunction(); 22158 bool OptForSize = MF.getFunction()->getAttributes(). 22159 hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize); 22160 22161 // SHLD/SHRD instructions have lower register pressure, but on some 22162 // platforms they have higher latency than the equivalent 22163 // series of shifts/or that would otherwise be generated. 22164 // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions 22165 // have higher latencies and we are not optimizing for size. 22166 if (!OptForSize && Subtarget->isSHLDSlow()) 22167 return SDValue(); 22168 22169 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) 22170 std::swap(N0, N1); 22171 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) 22172 return SDValue(); 22173 if (!N0.hasOneUse() || !N1.hasOneUse()) 22174 return SDValue(); 22175 22176 SDValue ShAmt0 = N0.getOperand(1); 22177 if (ShAmt0.getValueType() != MVT::i8) 22178 return SDValue(); 22179 SDValue ShAmt1 = N1.getOperand(1); 22180 if (ShAmt1.getValueType() != MVT::i8) 22181 return SDValue(); 22182 if (ShAmt0.getOpcode() == ISD::TRUNCATE) 22183 ShAmt0 = ShAmt0.getOperand(0); 22184 if (ShAmt1.getOpcode() == ISD::TRUNCATE) 22185 ShAmt1 = ShAmt1.getOperand(0); 22186 22187 SDLoc DL(N); 22188 unsigned Opc = X86ISD::SHLD; 22189 SDValue Op0 = N0.getOperand(0); 22190 SDValue Op1 = N1.getOperand(0); 22191 if (ShAmt0.getOpcode() == ISD::SUB) { 22192 Opc = X86ISD::SHRD; 22193 std::swap(Op0, Op1); 22194 std::swap(ShAmt0, ShAmt1); 22195 } 22196 22197 unsigned Bits = VT.getSizeInBits(); 22198 if (ShAmt1.getOpcode() == ISD::SUB) { 22199 SDValue Sum = ShAmt1.getOperand(0); 22200 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) { 22201 SDValue ShAmt1Op1 = ShAmt1.getOperand(1); 22202 if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE) 22203 ShAmt1Op1 = ShAmt1Op1.getOperand(0); 22204 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0) 22205 return DAG.getNode(Opc, DL, VT, 22206 Op0, Op1, 22207 DAG.getNode(ISD::TRUNCATE, DL, 22208 MVT::i8, ShAmt0)); 22209 } 22210 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) { 22211 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0); 22212 if (ShAmt0C && 22213 ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits) 22214 return DAG.getNode(Opc, DL, VT, 22215 N0.getOperand(0), N1.getOperand(0), 22216 
DAG.getNode(ISD::TRUNCATE, DL, 22217 MVT::i8, ShAmt0)); 22218 } 22219 22220 return SDValue(); 22221 } 22222 22223 // Generate NEG and CMOV for integer abs. 22224 static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { 22225 EVT VT = N->getValueType(0); 22226 22227 // Since X86 does not have CMOV for 8-bit integer, we don't convert 22228 // 8-bit integer abs to NEG and CMOV. 22229 if (VT.isInteger() && VT.getSizeInBits() == 8) 22230 return SDValue(); 22231 22232 SDValue N0 = N->getOperand(0); 22233 SDValue N1 = N->getOperand(1); 22234 SDLoc DL(N); 22235 22236 // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1) 22237 // and change it to SUB and CMOV. 22238 if (VT.isInteger() && N->getOpcode() == ISD::XOR && 22239 N0.getOpcode() == ISD::ADD && 22240 N0.getOperand(1) == N1 && 22241 N1.getOpcode() == ISD::SRA && 22242 N1.getOperand(0) == N0.getOperand(0)) 22243 if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1))) 22244 if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) { 22245 // Generate SUB & CMOV. 22246 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32), 22247 DAG.getConstant(0, VT), N0.getOperand(0)); 22248 22249 SDValue Ops[] = { N0.getOperand(0), Neg, 22250 DAG.getConstant(X86::COND_GE, MVT::i8), 22251 SDValue(Neg.getNode(), 1) }; 22252 return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops); 22253 } 22254 return SDValue(); 22255 } 22256 22257 // PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes 22258 static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG, 22259 TargetLowering::DAGCombinerInfo &DCI, 22260 const X86Subtarget *Subtarget) { 22261 if (DCI.isBeforeLegalizeOps()) 22262 return SDValue(); 22263 22264 if (Subtarget->hasCMov()) { 22265 SDValue RV = performIntegerAbsCombine(N, DAG); 22266 if (RV.getNode()) 22267 return RV; 22268 } 22269 22270 return SDValue(); 22271 } 22272 22273 /// PerformLOADCombine - Do target-specific dag combines on LOAD nodes. 22274 static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, 22275 TargetLowering::DAGCombinerInfo &DCI, 22276 const X86Subtarget *Subtarget) { 22277 LoadSDNode *Ld = cast<LoadSDNode>(N); 22278 EVT RegVT = Ld->getValueType(0); 22279 EVT MemVT = Ld->getMemoryVT(); 22280 SDLoc dl(Ld); 22281 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 22282 22283 // On Sandybridge unaligned 256bit loads are inefficient. 
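// Sketch of the split performed below (operand types are just an example):
// an unaligned, non-extending 256-bit load such as
//   v8f32 = load Ptr, align 1
// becomes two 128-bit loads at Ptr and Ptr+16 joined by a TokenFactor, and
// the halves are reassembled into a 256-bit value with Insert128BitVector.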
22284 ISD::LoadExtType Ext = Ld->getExtensionType(); 22285 unsigned Alignment = Ld->getAlignment(); 22286 bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8; 22287 if (RegVT.is256BitVector() && !Subtarget->hasInt256() && 22288 !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) { 22289 unsigned NumElems = RegVT.getVectorNumElements(); 22290 if (NumElems < 2) 22291 return SDValue(); 22292 22293 SDValue Ptr = Ld->getBasePtr(); 22294 SDValue Increment = DAG.getConstant(16, TLI.getPointerTy()); 22295 22296 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), 22297 NumElems/2); 22298 SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, 22299 Ld->getPointerInfo(), Ld->isVolatile(), 22300 Ld->isNonTemporal(), Ld->isInvariant(), 22301 Alignment); 22302 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); 22303 SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, 22304 Ld->getPointerInfo(), Ld->isVolatile(), 22305 Ld->isNonTemporal(), Ld->isInvariant(), 22306 std::min(16U, Alignment)); 22307 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 22308 Load1.getValue(1), 22309 Load2.getValue(1)); 22310 22311 SDValue NewVec = DAG.getUNDEF(RegVT); 22312 NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl); 22313 NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl); 22314 return DCI.CombineTo(N, NewVec, TF, true); 22315 } 22316 22317 return SDValue(); 22318 } 22319 22320 /// PerformSTORECombine - Do target-specific dag combines on STORE nodes. 22321 static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, 22322 const X86Subtarget *Subtarget) { 22323 StoreSDNode *St = cast<StoreSDNode>(N); 22324 EVT VT = St->getValue().getValueType(); 22325 EVT StVT = St->getMemoryVT(); 22326 SDLoc dl(St); 22327 SDValue StoredVal = St->getOperand(1); 22328 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 22329 22330 // If we are saving a concatenation of two XMM registers, perform two stores. 22331 // On Sandy Bridge, 256-bit memory operations are executed by two 22332 // 128-bit ports. However, on Haswell it is better to issue a single 256-bit 22333 // memory operation. 22334 unsigned Alignment = St->getAlignment(); 22335 bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8; 22336 if (VT.is256BitVector() && !Subtarget->hasInt256() && 22337 StVT == VT && !IsAligned) { 22338 unsigned NumElems = VT.getVectorNumElements(); 22339 if (NumElems < 2) 22340 return SDValue(); 22341 22342 SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl); 22343 SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl); 22344 22345 SDValue Stride = DAG.getConstant(16, TLI.getPointerTy()); 22346 SDValue Ptr0 = St->getBasePtr(); 22347 SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride); 22348 22349 SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0, 22350 St->getPointerInfo(), St->isVolatile(), 22351 St->isNonTemporal(), Alignment); 22352 SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1, 22353 St->getPointerInfo(), St->isVolatile(), 22354 St->isNonTemporal(), 22355 std::min(16U, Alignment)); 22356 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1); 22357 } 22358 22359 // Optimize trunc store (of multiple scalars) to shuffle and store. 22360 // First, pack all of the elements in one place. Next, store to memory 22361 // in fewer chunks. 
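// Worked example of the shuffle-then-store rewrite below (illustrative types):
// a truncating store of v4i32 as v4i16 is bitcast to v8i16, shuffled with
// mask <0,2,4,6,-1,-1,-1,-1> so the truncated elements sit in the low half,
// and the low 64 bits are then written with the widest legal store unit
// (i64, or f64 on 32-bit targets) instead of several narrow stores.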
22362 if (St->isTruncatingStore() && VT.isVector()) { 22363 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 22364 unsigned NumElems = VT.getVectorNumElements(); 22365 assert(StVT != VT && "Cannot truncate to the same type"); 22366 unsigned FromSz = VT.getVectorElementType().getSizeInBits(); 22367 unsigned ToSz = StVT.getVectorElementType().getSizeInBits(); 22368 22369 // From, To sizes and ElemCount must be pow of two 22370 if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue(); 22371 // We are going to use the original vector elt for storing. 22372 // Accumulated smaller vector elements must be a multiple of the store size. 22373 if (0 != (NumElems * FromSz) % ToSz) return SDValue(); 22374 22375 unsigned SizeRatio = FromSz / ToSz; 22376 22377 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits()); 22378 22379 // Create a type on which we perform the shuffle 22380 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), 22381 StVT.getScalarType(), NumElems*SizeRatio); 22382 22383 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); 22384 22385 SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue()); 22386 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); 22387 for (unsigned i = 0; i != NumElems; ++i) 22388 ShuffleVec[i] = i * SizeRatio; 22389 22390 // Can't shuffle using an illegal type. 22391 if (!TLI.isTypeLegal(WideVecVT)) 22392 return SDValue(); 22393 22394 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec, 22395 DAG.getUNDEF(WideVecVT), 22396 &ShuffleVec[0]); 22397 // At this point all of the data is stored at the bottom of the 22398 // register. We now need to save it to mem. 22399 22400 // Find the largest store unit 22401 MVT StoreType = MVT::i8; 22402 for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE; 22403 tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) { 22404 MVT Tp = (MVT::SimpleValueType)tp; 22405 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz) 22406 StoreType = Tp; 22407 } 22408 22409 // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. 22410 if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 && 22411 (64 <= NumElems * ToSz)) 22412 StoreType = MVT::f64; 22413 22414 // Bitcast the original vector into a vector of store-size units 22415 EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), 22416 StoreType, VT.getSizeInBits()/StoreType.getSizeInBits()); 22417 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); 22418 SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff); 22419 SmallVector<SDValue, 8> Chains; 22420 SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8, 22421 TLI.getPointerTy()); 22422 SDValue Ptr = St->getBasePtr(); 22423 22424 // Perform one or more big stores into memory. 22425 for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) { 22426 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, 22427 StoreType, ShuffWide, 22428 DAG.getIntPtrConstant(i)); 22429 SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr, 22430 St->getPointerInfo(), St->isVolatile(), 22431 St->isNonTemporal(), St->getAlignment()); 22432 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); 22433 Chains.push_back(Ch); 22434 } 22435 22436 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); 22437 } 22438 22439 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering 22440 // the FP state in cases where an emms may be missing. 
22441 // A preferable solution to the general problem is to figure out the right 22442 // places to insert EMMS. This qualifies as a quick hack. 22443 22444 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. 22445 if (VT.getSizeInBits() != 64) 22446 return SDValue(); 22447 22448 const Function *F = DAG.getMachineFunction().getFunction(); 22449 bool NoImplicitFloatOps = F->getAttributes(). 22450 hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat); 22451 bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps 22452 && Subtarget->hasSSE2(); 22453 if ((VT.isVector() || 22454 (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && 22455 isa<LoadSDNode>(St->getValue()) && 22456 !cast<LoadSDNode>(St->getValue())->isVolatile() && 22457 St->getChain().hasOneUse() && !St->isVolatile()) { 22458 SDNode* LdVal = St->getValue().getNode(); 22459 LoadSDNode *Ld = nullptr; 22460 int TokenFactorIndex = -1; 22461 SmallVector<SDValue, 8> Ops; 22462 SDNode* ChainVal = St->getChain().getNode(); 22463 // Must be a store of a load. We currently handle two cases: the load 22464 // is a direct child, and it's under an intervening TokenFactor. It is 22465 // possible to dig deeper under nested TokenFactors. 22466 if (ChainVal == LdVal) 22467 Ld = cast<LoadSDNode>(St->getChain()); 22468 else if (St->getValue().hasOneUse() && 22469 ChainVal->getOpcode() == ISD::TokenFactor) { 22470 for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) { 22471 if (ChainVal->getOperand(i).getNode() == LdVal) { 22472 TokenFactorIndex = i; 22473 Ld = cast<LoadSDNode>(St->getValue()); 22474 } else 22475 Ops.push_back(ChainVal->getOperand(i)); 22476 } 22477 } 22478 22479 if (!Ld || !ISD::isNormalLoad(Ld)) 22480 return SDValue(); 22481 22482 // If this is not the MMX case, i.e. we are just turning i64 load/store 22483 // into f64 load/store, avoid the transformation if there are multiple 22484 // uses of the loaded value. 22485 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) 22486 return SDValue(); 22487 22488 SDLoc LdDL(Ld); 22489 SDLoc StDL(N); 22490 // If we are a 64-bit capable x86, lower to a single movq load/store pair. 22491 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store 22492 // pair instead. 22493 if (Subtarget->is64Bit() || F64IsLegal) { 22494 EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64; 22495 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), 22496 Ld->getPointerInfo(), Ld->isVolatile(), 22497 Ld->isNonTemporal(), Ld->isInvariant(), 22498 Ld->getAlignment()); 22499 SDValue NewChain = NewLd.getValue(1); 22500 if (TokenFactorIndex != -1) { 22501 Ops.push_back(NewChain); 22502 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops); 22503 } 22504 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), 22505 St->getPointerInfo(), 22506 St->isVolatile(), St->isNonTemporal(), 22507 St->getAlignment()); 22508 } 22509 22510 // Otherwise, lower to two pairs of 32-bit loads / stores. 
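// Illustrative expansion used below for an i64 load feeding an i64 store on a
// 32-bit target when the f64 path is unavailable:
//   i32 Lo = load [Addr]       i32 Hi = load [Addr+4]
//   store i32 Lo, [StAddr]     store i32 Hi, [StAddr+4]
// with the alignment of the +4 halves reduced via MinAlign(Align, 4).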
22511 SDValue LoAddr = Ld->getBasePtr(); 22512 SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr, 22513 DAG.getConstant(4, MVT::i32)); 22514 22515 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, 22516 Ld->getPointerInfo(), 22517 Ld->isVolatile(), Ld->isNonTemporal(), 22518 Ld->isInvariant(), Ld->getAlignment()); 22519 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, 22520 Ld->getPointerInfo().getWithOffset(4), 22521 Ld->isVolatile(), Ld->isNonTemporal(), 22522 Ld->isInvariant(), 22523 MinAlign(Ld->getAlignment(), 4)); 22524 22525 SDValue NewChain = LoLd.getValue(1); 22526 if (TokenFactorIndex != -1) { 22527 Ops.push_back(LoLd); 22528 Ops.push_back(HiLd); 22529 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops); 22530 } 22531 22532 LoAddr = St->getBasePtr(); 22533 HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr, 22534 DAG.getConstant(4, MVT::i32)); 22535 22536 SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, 22537 St->getPointerInfo(), 22538 St->isVolatile(), St->isNonTemporal(), 22539 St->getAlignment()); 22540 SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr, 22541 St->getPointerInfo().getWithOffset(4), 22542 St->isVolatile(), 22543 St->isNonTemporal(), 22544 MinAlign(St->getAlignment(), 4)); 22545 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); 22546 } 22547 return SDValue(); 22548 } 22549 22550 /// isHorizontalBinOp - Return 'true' if this vector operation is "horizontal" 22551 /// and return the operands for the horizontal operation in LHS and RHS. A 22552 /// horizontal operation performs the binary operation on successive elements 22553 /// of its first operand, then on successive elements of its second operand, 22554 /// returning the resulting values in a vector. For example, if 22555 /// A = < float a0, float a1, float a2, float a3 > 22556 /// and 22557 /// B = < float b0, float b1, float b2, float b3 > 22558 /// then the result of doing a horizontal operation on A and B is 22559 /// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >. 22560 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form 22561 /// A horizontal-op B, for some already available A and B, and if so then LHS is 22562 /// set to A, RHS to B, and the routine returns 'true'. 22563 /// Note that the binary operation should have the property that if one of the 22564 /// operands is UNDEF then the result is UNDEF. 22565 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { 22566 // Look for the following pattern: if 22567 // A = < float a0, float a1, float a2, float a3 > 22568 // B = < float b0, float b1, float b2, float b3 > 22569 // and 22570 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6> 22571 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7> 22572 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 > 22573 // which is A horizontal-op B. 22574 22575 // At least one of the operands should be a vector shuffle. 22576 if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE && 22577 RHS.getOpcode() != ISD::VECTOR_SHUFFLE) 22578 return false; 22579 22580 MVT VT = LHS.getSimpleValueType(); 22581 22582 assert((VT.is128BitVector() || VT.is256BitVector()) && 22583 "Unsupported vector type for horizontal add/sub"); 22584 22585 // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to 22586 // operate independently on 128-bit lanes. 
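// For example (purely illustrative), with v8f32 inputs A and B the horizontal
// add being matched here computes
//   < a0+a1, a2+a3, b0+b1, b2+b3, a4+a5, a6+a7, b4+b5, b6+b7 >
// i.e. each 128-bit lane combines only elements from the same lane of A and
// B, which is what the per-lane index checks below verify.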
22587 unsigned NumElts = VT.getVectorNumElements(); 22588 unsigned NumLanes = VT.getSizeInBits()/128; 22589 unsigned NumLaneElts = NumElts / NumLanes; 22590 assert((NumLaneElts % 2 == 0) && 22591 "Vector type should have an even number of elements in each lane"); 22592 unsigned HalfLaneElts = NumLaneElts/2; 22593 22594 // View LHS in the form 22595 // LHS = VECTOR_SHUFFLE A, B, LMask 22596 // If LHS is not a shuffle then pretend it is the shuffle 22597 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1> 22598 // NOTE: in what follows a default initialized SDValue represents an UNDEF of 22599 // type VT. 22600 SDValue A, B; 22601 SmallVector<int, 16> LMask(NumElts); 22602 if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) { 22603 if (LHS.getOperand(0).getOpcode() != ISD::UNDEF) 22604 A = LHS.getOperand(0); 22605 if (LHS.getOperand(1).getOpcode() != ISD::UNDEF) 22606 B = LHS.getOperand(1); 22607 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask(); 22608 std::copy(Mask.begin(), Mask.end(), LMask.begin()); 22609 } else { 22610 if (LHS.getOpcode() != ISD::UNDEF) 22611 A = LHS; 22612 for (unsigned i = 0; i != NumElts; ++i) 22613 LMask[i] = i; 22614 } 22615 22616 // Likewise, view RHS in the form 22617 // RHS = VECTOR_SHUFFLE C, D, RMask 22618 SDValue C, D; 22619 SmallVector<int, 16> RMask(NumElts); 22620 if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) { 22621 if (RHS.getOperand(0).getOpcode() != ISD::UNDEF) 22622 C = RHS.getOperand(0); 22623 if (RHS.getOperand(1).getOpcode() != ISD::UNDEF) 22624 D = RHS.getOperand(1); 22625 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask(); 22626 std::copy(Mask.begin(), Mask.end(), RMask.begin()); 22627 } else { 22628 if (RHS.getOpcode() != ISD::UNDEF) 22629 C = RHS; 22630 for (unsigned i = 0; i != NumElts; ++i) 22631 RMask[i] = i; 22632 } 22633 22634 // Check that the shuffles are both shuffling the same vectors. 22635 if (!(A == C && B == D) && !(A == D && B == C)) 22636 return false; 22637 22638 // If everything is UNDEF then bail out: it would be better to fold to UNDEF. 22639 if (!A.getNode() && !B.getNode()) 22640 return false; 22641 22642 // If A and B occur in reverse order in RHS, then "swap" them (which means 22643 // rewriting the mask). 22644 if (A != C) 22645 CommuteVectorShuffleMask(RMask, NumElts); 22646 22647 // At this point LHS and RHS are equivalent to 22648 // LHS = VECTOR_SHUFFLE A, B, LMask 22649 // RHS = VECTOR_SHUFFLE A, B, RMask 22650 // Check that the masks correspond to performing a horizontal operation. 22651 for (unsigned l = 0; l != NumElts; l += NumLaneElts) { 22652 for (unsigned i = 0; i != NumLaneElts; ++i) { 22653 int LIdx = LMask[i+l], RIdx = RMask[i+l]; 22654 22655 // Ignore any UNDEF components. 22656 if (LIdx < 0 || RIdx < 0 || 22657 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) || 22658 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts))) 22659 continue; 22660 22661 // Check that successive elements are being operated on. If not, this is 22662 // not a horizontal operation. 22663 unsigned Src = (i/HalfLaneElts); // each lane is split between srcs 22664 int Index = 2*(i%HalfLaneElts) + NumElts*Src + l; 22665 if (!(LIdx == Index && RIdx == Index + 1) && 22666 !(IsCommutative && LIdx == Index + 1 && RIdx == Index)) 22667 return false; 22668 } 22669 } 22670 22671 LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it. 22672 RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it. 
22673 return true; 22674 } 22675 22676 /// PerformFADDCombine - Do target-specific dag combines on floating point adds. 22677 static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, 22678 const X86Subtarget *Subtarget) { 22679 EVT VT = N->getValueType(0); 22680 SDValue LHS = N->getOperand(0); 22681 SDValue RHS = N->getOperand(1); 22682 22683 // Try to synthesize horizontal adds from adds of shuffles. 22684 if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || 22685 (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && 22686 isHorizontalBinOp(LHS, RHS, true)) 22687 return DAG.getNode(X86ISD::FHADD, SDLoc(N), VT, LHS, RHS); 22688 return SDValue(); 22689 } 22690 22691 /// PerformFSUBCombine - Do target-specific dag combines on floating point subs. 22692 static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG, 22693 const X86Subtarget *Subtarget) { 22694 EVT VT = N->getValueType(0); 22695 SDValue LHS = N->getOperand(0); 22696 SDValue RHS = N->getOperand(1); 22697 22698 // Try to synthesize horizontal subs from subs of shuffles. 22699 if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || 22700 (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && 22701 isHorizontalBinOp(LHS, RHS, false)) 22702 return DAG.getNode(X86ISD::FHSUB, SDLoc(N), VT, LHS, RHS); 22703 return SDValue(); 22704 } 22705 22706 /// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and 22707 /// X86ISD::FXOR nodes. 22708 static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { 22709 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); 22710 // F[X]OR(0.0, x) -> x 22711 // F[X]OR(x, 0.0) -> x 22712 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 22713 if (C->getValueAPF().isPosZero()) 22714 return N->getOperand(1); 22715 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 22716 if (C->getValueAPF().isPosZero()) 22717 return N->getOperand(0); 22718 return SDValue(); 22719 } 22720 22721 /// PerformFMinFMaxCombine - Do target-specific dag combines on X86ISD::FMIN and 22722 /// X86ISD::FMAX nodes. 22723 static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) { 22724 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX); 22725 22726 // Only perform optimizations if UnsafeMath is used. 22727 if (!DAG.getTarget().Options.UnsafeFPMath) 22728 return SDValue(); 22729 22730 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes 22731 // into FMINC and FMAXC, which are Commutative operations. 22732 unsigned NewOp = 0; 22733 switch (N->getOpcode()) { 22734 default: llvm_unreachable("unknown opcode"); 22735 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break; 22736 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break; 22737 } 22738 22739 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0), 22740 N->getOperand(0), N->getOperand(1)); 22741 } 22742 22743 /// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes. 
22744 static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { 22745 // FAND(0.0, x) -> 0.0 22746 // FAND(x, 0.0) -> 0.0 22747 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 22748 if (C->getValueAPF().isPosZero()) 22749 return N->getOperand(0); 22750 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 22751 if (C->getValueAPF().isPosZero()) 22752 return N->getOperand(1); 22753 return SDValue(); 22754 } 22755 22756 /// PerformFANDNCombine - Do target-specific dag combines on X86ISD::FANDN nodes 22757 static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) { 22758 // FANDN(x, 0.0) -> 0.0 22759 // FANDN(0.0, x) -> x 22760 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 22761 if (C->getValueAPF().isPosZero()) 22762 return N->getOperand(1); 22763 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 22764 if (C->getValueAPF().isPosZero()) 22765 return N->getOperand(1); 22766 return SDValue(); 22767 } 22768 22769 static SDValue PerformBTCombine(SDNode *N, 22770 SelectionDAG &DAG, 22771 TargetLowering::DAGCombinerInfo &DCI) { 22772 // BT ignores high bits in the bit index operand. 22773 SDValue Op1 = N->getOperand(1); 22774 if (Op1.hasOneUse()) { 22775 unsigned BitWidth = Op1.getValueSizeInBits(); 22776 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)); 22777 APInt KnownZero, KnownOne; 22778 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), 22779 !DCI.isBeforeLegalizeOps()); 22780 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 22781 if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) || 22782 TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO)) 22783 DCI.CommitTargetLoweringOpt(TLO); 22784 } 22785 return SDValue(); 22786 } 22787 22788 static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) { 22789 SDValue Op = N->getOperand(0); 22790 if (Op.getOpcode() == ISD::BITCAST) 22791 Op = Op.getOperand(0); 22792 EVT VT = N->getValueType(0), OpVT = Op.getValueType(); 22793 if (Op.getOpcode() == X86ISD::VZEXT_LOAD && 22794 VT.getVectorElementType().getSizeInBits() == 22795 OpVT.getVectorElementType().getSizeInBits()) { 22796 return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op); 22797 } 22798 return SDValue(); 22799 } 22800 22801 static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG, 22802 const X86Subtarget *Subtarget) { 22803 EVT VT = N->getValueType(0); 22804 if (!VT.isVector()) 22805 return SDValue(); 22806 22807 SDValue N0 = N->getOperand(0); 22808 SDValue N1 = N->getOperand(1); 22809 EVT ExtraVT = cast<VTSDNode>(N1)->getVT(); 22810 SDLoc dl(N); 22811 22812 // The SIGN_EXTEND_INREG to v4i64 is expensive operation on the 22813 // both SSE and AVX2 since there is no sign-extended shift right 22814 // operation on a vector with 64-bit elements. 22815 //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) -> 22816 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT))) 22817 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND || 22818 N0.getOpcode() == ISD::SIGN_EXTEND)) { 22819 SDValue N00 = N0.getOperand(0); 22820 22821 // EXTLOAD has a better solution on AVX2, 22822 // it may be replaced with X86ISD::VSEXT node. 
22823 if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256()) 22824 if (!ISD::isNormalLoad(N00.getNode())) 22825 return SDValue(); 22826 22827 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) { 22828 SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, 22829 N00, N1); 22830 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp); 22831 } 22832 } 22833 return SDValue(); 22834 } 22835 22836 static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, 22837 TargetLowering::DAGCombinerInfo &DCI, 22838 const X86Subtarget *Subtarget) { 22839 if (!DCI.isBeforeLegalizeOps()) 22840 return SDValue(); 22841 22842 if (!Subtarget->hasFp256()) 22843 return SDValue(); 22844 22845 EVT VT = N->getValueType(0); 22846 if (VT.isVector() && VT.getSizeInBits() == 256) { 22847 SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget); 22848 if (R.getNode()) 22849 return R; 22850 } 22851 22852 return SDValue(); 22853 } 22854 22855 static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG, 22856 const X86Subtarget* Subtarget) { 22857 SDLoc dl(N); 22858 EVT VT = N->getValueType(0); 22859 22860 // Let legalize expand this if it isn't a legal type yet. 22861 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 22862 return SDValue(); 22863 22864 EVT ScalarVT = VT.getScalarType(); 22865 if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || 22866 (!Subtarget->hasFMA() && !Subtarget->hasFMA4())) 22867 return SDValue(); 22868 22869 SDValue A = N->getOperand(0); 22870 SDValue B = N->getOperand(1); 22871 SDValue C = N->getOperand(2); 22872 22873 bool NegA = (A.getOpcode() == ISD::FNEG); 22874 bool NegB = (B.getOpcode() == ISD::FNEG); 22875 bool NegC = (C.getOpcode() == ISD::FNEG); 22876 22877 // Negative multiplication when NegA xor NegB 22878 bool NegMul = (NegA != NegB); 22879 if (NegA) 22880 A = A.getOperand(0); 22881 if (NegB) 22882 B = B.getOperand(0); 22883 if (NegC) 22884 C = C.getOperand(0); 22885 22886 unsigned Opcode; 22887 if (!NegMul) 22888 Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB; 22889 else 22890 Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB; 22891 22892 return DAG.getNode(Opcode, dl, VT, A, B, C); 22893 } 22894 22895 static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, 22896 TargetLowering::DAGCombinerInfo &DCI, 22897 const X86Subtarget *Subtarget) { 22898 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> 22899 // (and (i32 x86isd::setcc_carry), 1) 22900 // This eliminates the zext. This transformation is necessary because 22901 // ISD::SETCC is always legalized to i8. 
22902 SDLoc dl(N);
22903 SDValue N0 = N->getOperand(0);
22904 EVT VT = N->getValueType(0);
22905
22906 if (N0.getOpcode() == ISD::AND &&
22907 N0.hasOneUse() &&
22908 N0.getOperand(0).hasOneUse()) {
22909 SDValue N00 = N0.getOperand(0);
22910 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
22911 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
22912 if (!C || C->getZExtValue() != 1)
22913 return SDValue();
22914 return DAG.getNode(ISD::AND, dl, VT,
22915 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
22916 N00.getOperand(0), N00.getOperand(1)),
22917 DAG.getConstant(1, VT));
22918 }
22919 }
22920
22921 if (N0.getOpcode() == ISD::TRUNCATE &&
22922 N0.hasOneUse() &&
22923 N0.getOperand(0).hasOneUse()) {
22924 SDValue N00 = N0.getOperand(0);
22925 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
22926 return DAG.getNode(ISD::AND, dl, VT,
22927 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
22928 N00.getOperand(0), N00.getOperand(1)),
22929 DAG.getConstant(1, VT));
22930 }
22931 }
22932 if (VT.is256BitVector()) {
22933 SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
22934 if (R.getNode())
22935 return R;
22936 }
22937
22938 return SDValue();
22939 }
22940
22941 // Optimize x == -y --> x+y == 0
22942 // x != -y --> x+y != 0
22943 static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
22944 const X86Subtarget* Subtarget) {
22945 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
22946 SDValue LHS = N->getOperand(0);
22947 SDValue RHS = N->getOperand(1);
22948 EVT VT = N->getValueType(0);
22949 SDLoc DL(N);
22950
22951 if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
22952 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0)))
22953 if (C->getAPIntValue() == 0 && LHS.hasOneUse()) {
22954 SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
22955 LHS.getValueType(), RHS, LHS.getOperand(1));
22956 return DAG.getSetCC(SDLoc(N), N->getValueType(0),
22957 addV, DAG.getConstant(0, addV.getValueType()), CC);
22958 }
22959 if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
22960 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0)))
22961 if (C->getAPIntValue() == 0 && RHS.hasOneUse()) {
22962 SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
22963 RHS.getValueType(), LHS, RHS.getOperand(1));
22964 return DAG.getSetCC(SDLoc(N), N->getValueType(0),
22965 addV, DAG.getConstant(0, addV.getValueType()), CC);
22966 }
22967
22968 if (VT.getScalarType() == MVT::i1) {
22969 bool IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
22970 (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
22971 bool IsVZero0 = ISD::isBuildVectorAllZeros(LHS.getNode());
22972 if (!IsSEXT0 && !IsVZero0)
22973 return SDValue();
22974 bool IsSEXT1 = (RHS.getOpcode() == ISD::SIGN_EXTEND) &&
22975 (RHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
22976 bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
22977
22978 if (!IsSEXT1 && !IsVZero1)
22979 return SDValue();
22980
22981 if (IsSEXT0 && IsVZero1) {
22982 assert(VT == LHS.getOperand(0).getValueType() && "Unexpected operand type");
22983 if (CC == ISD::SETEQ)
22984 return DAG.getNOT(DL, LHS.getOperand(0), VT);
22985 return LHS.getOperand(0);
22986 }
22987 if (IsSEXT1 && IsVZero0) {
22988 assert(VT == RHS.getOperand(0).getValueType() && "Unexpected operand type");
22989 if (CC == ISD::SETEQ)
22990 return DAG.getNOT(DL, RHS.getOperand(0), VT);
22991 return RHS.getOperand(0);
22992 }
22993 }
22994
22995 return SDValue();
22996 }
22997
22998 static SDValue
PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
22999 const X86Subtarget *Subtarget) {
23000 SDLoc dl(N);
23001 MVT VT = N->getOperand(1)->getSimpleValueType(0);
23002 assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
23003 "X86insertps is only defined for v4x32");
23004
23005 SDValue Ld = N->getOperand(1);
23006 if (MayFoldLoad(Ld)) {
23007 // Extract the countS bits from the immediate so we can get the proper
23008 // address when narrowing the vector load to a specific element.
23009 // When the second source op is a memory address, insertps doesn't use
23010 // countS and just gets an f32 from that address.
23011 unsigned DestIndex =
23012 cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
23013 Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
23014 } else
23015 return SDValue();
23016
23017 // Create this as a scalar to vector to match the instruction pattern.
23018 SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
23019 // countS bits are ignored when loading from memory on insertps, which
23020 // means we don't need to explicitly set them to 0.
23021 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
23022 LoadScalarToVector, N->getOperand(2));
23023 }
23024
23025 // Helper function of PerformSETCCCombine. It materializes "setb reg"
23026 // as "sbb reg,reg", since it can be extended without zext and produces
23027 // an all-ones bit which is more useful than 0/1 in some cases.
23028 static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG,
23029 MVT VT) {
23030 if (VT == MVT::i8)
23031 return DAG.getNode(ISD::AND, DL, VT,
23032 DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
23033 DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS),
23034 DAG.getConstant(1, VT));
23035 assert(VT == MVT::i1 && "Unexpected type for SETCC node");
23036 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1,
23037 DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
23038 DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS));
23039 }
23040
23041 // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
23042 static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
23043 TargetLowering::DAGCombinerInfo &DCI,
23044 const X86Subtarget *Subtarget) {
23045 SDLoc DL(N);
23046 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
23047 SDValue EFLAGS = N->getOperand(1);
23048
23049 if (CC == X86::COND_A) {
23050 // Try to convert COND_A into COND_B in an attempt to facilitate
23051 // materializing "setb reg".
23052 //
23053 // Do not flip "e > c", where "c" is a constant, because Cmp instruction
23054 // cannot take an immediate as its first operand.
23055 //
23056 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
23057 EFLAGS.getValueType().isInteger() &&
23058 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
23059 SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
23060 EFLAGS.getNode()->getVTList(),
23061 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
23062 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
23063 return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0));
23064 }
23065 }
23066
23067 // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
23068 // a zext and produces an all-ones bit which is more useful than 0/1 in some
23069 // cases.
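// Illustrative effect of MaterializeSETB on an i8 result: rather than
//   setb %al                         ; produces 0 or 1
// the combine emits (and (X86ISD::SETCC_CARRY COND_B, EFLAGS), 1), i.e.
//   sbb %al, %al                     ; all-ones if CF is set
//   and $1, %al
// and the all-ones SETCC_CARRY value can be reused or widened without a zext.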
23070 if (CC == X86::COND_B) 23071 return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0)); 23072 23073 SDValue Flags; 23074 23075 Flags = checkBoolTestSetCCCombine(EFLAGS, CC); 23076 if (Flags.getNode()) { 23077 SDValue Cond = DAG.getConstant(CC, MVT::i8); 23078 return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags); 23079 } 23080 23081 return SDValue(); 23082 } 23083 23084 // Optimize branch condition evaluation. 23085 // 23086 static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG, 23087 TargetLowering::DAGCombinerInfo &DCI, 23088 const X86Subtarget *Subtarget) { 23089 SDLoc DL(N); 23090 SDValue Chain = N->getOperand(0); 23091 SDValue Dest = N->getOperand(1); 23092 SDValue EFLAGS = N->getOperand(3); 23093 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2)); 23094 23095 SDValue Flags; 23096 23097 Flags = checkBoolTestSetCCCombine(EFLAGS, CC); 23098 if (Flags.getNode()) { 23099 SDValue Cond = DAG.getConstant(CC, MVT::i8); 23100 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond, 23101 Flags); 23102 } 23103 23104 return SDValue(); 23105 } 23106 23107 static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, 23108 SelectionDAG &DAG) { 23109 // Take advantage of vector comparisons producing 0 or -1 in each lane to 23110 // optimize away operation when it's from a constant. 23111 // 23112 // The general transformation is: 23113 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) --> 23114 // AND(VECTOR_CMP(x,y), constant2) 23115 // constant2 = UNARYOP(constant) 23116 23117 // Early exit if this isn't a vector operation, the operand of the 23118 // unary operation isn't a bitwise AND, or if the sizes of the operations 23119 // aren't the same. 23120 EVT VT = N->getValueType(0); 23121 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND || 23122 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC || 23123 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits()) 23124 return SDValue(); 23125 23126 // Now check that the other operand of the AND is a constant. We could 23127 // make the transformation for non-constant splats as well, but it's unclear 23128 // that would be a benefit as it would not eliminate any operations, just 23129 // perform one more step in scalar code before moving to the vector unit. 23130 if (BuildVectorSDNode *BV = 23131 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) { 23132 // Bail out if the vector isn't a constant. 23133 if (!BV->isConstant()) 23134 return SDValue(); 23135 23136 // Everything checks out. Build up the new and improved node. 23137 SDLoc DL(N); 23138 EVT IntVT = BV->getValueType(0); 23139 // Create a new constant of the appropriate type for the transformed 23140 // DAG. 23141 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0)); 23142 // The AND node needs bitcasts to/from an integer vector type around it. 23143 SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst); 23144 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, 23145 N->getOperand(0)->getOperand(0), MaskConst); 23146 SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd); 23147 return Res; 23148 } 23149 23150 return SDValue(); 23151 } 23152 23153 static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, 23154 const X86TargetLowering *XTLI) { 23155 // First try to optimize away the conversion entirely when it's 23156 // conditionally from a constant. Vectors only. 
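// Illustrative instance of the UNARYOP(AND(VECTOR_CMP, constant)) rewrite
// attempted first (constants are example values):
//   (v4f32 sint_to_fp (and (setcc %a, %b, setlt), <i32 1, 1, 1, 1>))
// becomes
//   (v4f32 bitcast (and (setcc %a, %b, setlt), (bitcast <float 1.0 x 4>)))
// since every compare lane is 0 or -1, masking the pre-converted constant
// yields the same lanes as converting after the mask.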
  SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG);
  if (Res != SDValue())
    return Res;

  // Now move on to more general possibilities.
  SDValue Op0 = N->getOperand(0);
  EVT InVT = Op0->getValueType(0);

  // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
  if (InVT == MVT::v8i8 || InVT == MVT::v4i8) {
    SDLoc dl(N);
    MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
    SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
    return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P);
  }

  // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
  // a 32-bit target where SSE doesn't support i64->FP operations.
  if (Op0.getOpcode() == ISD::LOAD) {
    LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
    EVT VT = Ld->getValueType(0);
    if (!Ld->isVolatile() && !N->getValueType(0).isVector() &&
        ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
        !XTLI->getSubtarget()->is64Bit() &&
        VT == MVT::i64) {
      SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0),
                                          Ld->getChain(), Op0, DAG);
      DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
      return FILDChain;
    }
  }
  return SDValue();
}

// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
                                 X86TargetLowering::DAGCombinerInfo &DCI) {
  // If the LHS and RHS of the ADC node are zero, then it can't overflow and
  // the result is either zero or one (depending on the input carry bit).
  // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
  if (X86::isZeroNode(N->getOperand(0)) &&
      X86::isZeroNode(N->getOperand(1)) &&
      // We don't have a good way to replace an EFLAGS use, so only do this when
      // dead right now.
      SDValue(N, 1).use_empty()) {
    SDLoc DL(N);
    EVT VT = N->getValueType(0);
    SDValue CarryOut = DAG.getConstant(0, N->getValueType(1));
    SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
                               DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
                                           DAG.getConstant(X86::COND_B, MVT::i8),
                                           N->getOperand(2)),
                               DAG.getConstant(1, VT));
    return DCI.CombineTo(N, Res1, CarryOut);
  }

  return SDValue();
}

// fold (add Y, (sete  X, 0)) -> adc  0, Y
//      (add Y, (setne X, 0)) -> sbb -1, Y
//      (sub (sete  X, 0), Y) -> sbb  0, Y
//      (sub (setne X, 0), Y) -> adc -1, Y
static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
  SDLoc DL(N);

  // Look through ZExts.
  SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
  if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
    return SDValue();

  SDValue SetCC = Ext.getOperand(0);
  if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
    return SDValue();

  X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
  if (CC != X86::COND_E && CC != X86::COND_NE)
    return SDValue();

  SDValue Cmp = SetCC.getOperand(1);
  if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
      !X86::isZeroNode(Cmp.getOperand(1)) ||
      !Cmp.getOperand(0).getValueType().isInteger())
    return SDValue();

  SDValue CmpOp0 = Cmp.getOperand(0);
  SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
                               DAG.getConstant(1, CmpOp0.getValueType()));

  SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
  if (CC == X86::COND_NE)
    return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
                       DL, OtherVal.getValueType(), OtherVal,
                       DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp);
  return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
                     DL, OtherVal.getValueType(), OtherVal,
                     DAG.getConstant(0, OtherVal.getValueType()), NewCmp);
}

/// PerformAddCombine - Do target-specific dag combines on integer adds.
static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
                                 const X86Subtarget *Subtarget) {
  EVT VT = N->getValueType(0);
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // Try to synthesize horizontal adds from adds of shuffles.
  if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
       (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
      isHorizontalBinOp(Op0, Op1, true))
    return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);

  return OptimizeConditionalInDecrement(N, DAG);
}

static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
                                 const X86Subtarget *Subtarget) {
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // X86 can't encode an immediate LHS of a sub. See if we can push the
  // negation into a preceding instruction.
  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
    // If the RHS of the sub is a XOR with one use and a constant, invert the
    // immediate. Then add one to the LHS of the sub so we can turn
    // X-Y -> X+~Y+1, saving one register.
    if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
        isa<ConstantSDNode>(Op1.getOperand(1))) {
      APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
      EVT VT = Op0.getValueType();
      SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
                                   Op1.getOperand(0),
                                   DAG.getConstant(~XorC, VT));
      return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
                         DAG.getConstant(C->getAPIntValue()+1, VT));
    }
  }

  // Try to synthesize horizontal subs from subs of shuffles.
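  // Illustrative note: for v4i32 operands A and B, X86ISD::HSUB computes
  //   { A[0]-A[1], A[2]-A[3], B[0]-B[1], B[2]-B[3] }
  // so a sub of suitably shuffled inputs, as recognized by isHorizontalBinOp,
  // can be replaced by a single PHSUBD/VPHSUBD.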
  EVT VT = N->getValueType(0);
  if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
       (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
      isHorizontalBinOp(Op0, Op1, true))
    return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);

  return OptimizeConditionalInDecrement(N, DAG);
}

/// performVZEXTCombine - Performs VZEXT combines.
static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const X86Subtarget *Subtarget) {
  // (vzext (bitcast (vzext x))) -> (vzext x)
  SDValue In = N->getOperand(0);
  while (In.getOpcode() == ISD::BITCAST)
    In = In.getOperand(0);

  if (In.getOpcode() != X86ISD::VZEXT)
    return SDValue();

  return DAG.getNode(X86ISD::VZEXT, SDLoc(N), N->getValueType(0),
                     In.getOperand(0));
}

SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  switch (N->getOpcode()) {
  default: break;
  case ISD::EXTRACT_VECTOR_ELT:
    return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
  case ISD::VSELECT:
  case ISD::SELECT: return PerformSELECTCombine(N, DAG, DCI, Subtarget);
  case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI, Subtarget);
  case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget);
  case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget);
  case X86ISD::ADC: return PerformADCCombine(N, DAG, DCI);
  case ISD::MUL: return PerformMulCombine(N, DAG, DCI);
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL: return PerformShiftCombine(N, DAG, DCI, Subtarget);
  case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget);
  case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget);
  case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget);
  case ISD::LOAD: return PerformLOADCombine(N, DAG, DCI, Subtarget);
  case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
  case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this);
  case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget);
  case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget);
  case X86ISD::FXOR:
  case X86ISD::FOR: return PerformFORCombine(N, DAG);
  case X86ISD::FMIN:
  case X86ISD::FMAX: return PerformFMinFMaxCombine(N, DAG);
  case X86ISD::FAND: return PerformFANDCombine(N, DAG);
  case X86ISD::FANDN: return PerformFANDNCombine(N, DAG);
  case X86ISD::BT: return PerformBTCombine(N, DAG, DCI);
  case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG);
  case ISD::ANY_EXTEND:
  case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, DCI, Subtarget);
  case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget);
  case ISD::SIGN_EXTEND_INREG:
    return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
  case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG, DCI, Subtarget);
  case ISD::SETCC: return PerformISDSETCCCombine(N, DAG, Subtarget);
  case X86ISD::SETCC: return PerformSETCCCombine(N, DAG, DCI, Subtarget);
  case X86ISD::BRCOND: return PerformBrCondCombine(N, DAG, DCI, Subtarget);
  case X86ISD::VZEXT: return performVZEXTCombine(N, DAG, DCI, Subtarget);
  case X86ISD::SHUFP: // Handle all target specific shuffles
  case X86ISD::PALIGNR:
  case X86ISD::UNPCKH:
  case X86ISD::UNPCKL:
  case X86ISD::MOVHLPS:
  case X86ISD::MOVLHPS:
  case X86ISD::PSHUFB:
  case X86ISD::PSHUFD:
  case X86ISD::PSHUFHW:
  case X86ISD::PSHUFLW:
  case X86ISD::MOVSS:
  case X86ISD::MOVSD:
  case X86ISD::VPERMILP:
  case X86ISD::VPERM2X128:
  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI, Subtarget);
  case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget);
  case ISD::INTRINSIC_WO_CHAIN:
    return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget);
  case X86ISD::INSERTPS:
    return PerformINSERTPSCombine(N, DAG, Subtarget);
  case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DAG, Subtarget);
  }

  return SDValue();
}

/// isTypeDesirableForOp - Return true if the target has native support for
/// the specified value type and it is 'desirable' to use the type for the
/// given node type. e.g. On x86 i16 is legal, but undesirable since i16
/// instruction encodings are longer and some i16 instructions are slow.
bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
  if (!isTypeLegal(VT))
    return false;
  if (VT != MVT::i16)
    return true;

  switch (Opc) {
  default:
    return true;
  case ISD::LOAD:
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SUB:
  case ISD::ADD:
  case ISD::MUL:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    return false;
  }
}

/// IsDesirableToPromoteOp - This method queries the target whether it is
/// beneficial for dag combiner to promote the specified node. If true, it
/// should return the desired promotion type by reference.
bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
  EVT VT = Op.getValueType();
  if (VT != MVT::i16)
    return false;

  bool Promote = false;
  bool Commute = false;
  switch (Op.getOpcode()) {
  default: break;
  case ISD::LOAD: {
    LoadSDNode *LD = cast<LoadSDNode>(Op);
    // If the non-extending load has a single use and it's not live out, then it
    // might be folded.
    if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
        Op.hasOneUse()*/) {
      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
             UE = Op.getNode()->use_end(); UI != UE; ++UI) {
        // The only case where we'd want to promote LOAD (rather than it being
        // promoted as an operand) is when its only use is live out.
        if (UI->getOpcode() != ISD::CopyToReg)
          return false;
      }
    }
    Promote = true;
    break;
  }
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
    Promote = true;
    break;
  case ISD::SHL:
  case ISD::SRL: {
    SDValue N0 = Op.getOperand(0);
    // Look out for (store (shl (load), x)).
    if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
      return false;
    Promote = true;
    break;
  }
  case ISD::ADD:
  case ISD::MUL:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    Commute = true;
    // fallthrough
  case ISD::SUB: {
    SDValue N0 = Op.getOperand(0);
    SDValue N1 = Op.getOperand(1);
    if (!Commute && MayFoldLoad(N1))
      return false;
    // Avoid disabling potential load folding opportunities.
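    // For example (illustrative): with (add (load m), %x) at i16, the load can
    // be folded into the memory form of the add; promoting the node to i32
    // would instead require a separate extending load, so we decline below.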
    if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
      return false;
    if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
      return false;
    Promote = true;
  }
  }

  PVT = MVT::i32;
  return Promote;
}

//===----------------------------------------------------------------------===//
//                           X86 Inline Assembly Support
//===----------------------------------------------------------------------===//

namespace {
  // Helper to match a string separated by whitespace.
  bool matchAsmImpl(StringRef s, ArrayRef<const StringRef *> args) {
    s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace.

    for (unsigned i = 0, e = args.size(); i != e; ++i) {
      StringRef piece(*args[i]);
      if (!s.startswith(piece)) // Check if the piece matches.
        return false;

      s = s.substr(piece.size());
      StringRef::size_type pos = s.find_first_not_of(" \t");
      if (pos == 0) // We matched a prefix.
        return false;

      s = s.substr(pos);
    }

    return s.empty();
  }
  const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={};
}

static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {

  if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
    if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
        std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
        std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {

      if (AsmPieces.size() == 3)
        return true;
      else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
        return true;
    }
  }
  return false;
}

bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());

  std::string AsmStr = IA->getAsmString();

  IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
  if (!Ty || Ty->getBitWidth() % 16 != 0)
    return false;

  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
  SmallVector<StringRef, 4> AsmPieces;
  SplitString(AsmStr, AsmPieces, ";\n");

  switch (AsmPieces.size()) {
  default: return false;
  case 1:
    // FIXME: this should verify that we are targeting a 486 or better. If not,
    // we will turn this bswap into something that will be lowered to logical
    // ops instead of emitting the bswap asm. For now, we don't support 486 or
    // lower so don't worry about this.
    // bswap $0
    if (matchAsm(AsmPieces[0], "bswap", "$0") ||
        matchAsm(AsmPieces[0], "bswapl", "$0") ||
        matchAsm(AsmPieces[0], "bswapq", "$0") ||
        matchAsm(AsmPieces[0], "bswap", "${0:q}") ||
        matchAsm(AsmPieces[0], "bswapl", "${0:q}") ||
        matchAsm(AsmPieces[0], "bswapq", "${0:q}")) {
      // No need to check constraints, nothing other than the equivalent of
      // "=r,0" would be valid here.
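      // Illustrative example: GCC-style source such as
      //   __asm__("bswap $0" : "=r"(v) : "0"(v));
      // on a 32-bit integer is replaced here by a call to llvm.bswap.i32.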
      return IntrinsicLowering::LowerToByteSwap(CI);
    }

    // rorw $$8, ${0:w} --> llvm.bswap.i16
    if (CI->getType()->isIntegerTy(16) &&
        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
        (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") ||
         matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}"))) {
      AsmPieces.clear();
      const std::string &ConstraintsStr = IA->getConstraintString();
      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
      if (clobbersFlagRegisters(AsmPieces))
        return IntrinsicLowering::LowerToByteSwap(CI);
    }
    break;
  case 3:
    if (CI->getType()->isIntegerTy(32) &&
        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
        matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") &&
        matchAsm(AsmPieces[1], "rorl", "$$16,", "$0") &&
        matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}")) {
      AsmPieces.clear();
      const std::string &ConstraintsStr = IA->getConstraintString();
      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
      if (clobbersFlagRegisters(AsmPieces))
        return IntrinsicLowering::LowerToByteSwap(CI);
    }

    if (CI->getType()->isIntegerTy(64)) {
      InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
      if (Constraints.size() >= 2 &&
          Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
          Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
        // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
        if (matchAsm(AsmPieces[0], "bswap", "%eax") &&
            matchAsm(AsmPieces[1], "bswap", "%edx") &&
            matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx"))
          return IntrinsicLowering::LowerToByteSwap(CI);
      }
    }
    break;
  }
  return false;
}

/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
X86TargetLowering::ConstraintType
X86TargetLowering::getConstraintType(const std::string &Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 'R':
    case 'q':
    case 'Q':
    case 'f':
    case 't':
    case 'u':
    case 'y':
    case 'x':
    case 'Y':
    case 'l':
      return C_RegisterClass;
    case 'a':
    case 'b':
    case 'c':
    case 'd':
    case 'S':
    case 'D':
    case 'A':
      return C_Register;
    case 'I':
    case 'J':
    case 'K':
    case 'L':
    case 'M':
    case 'N':
    case 'G':
    case 'C':
    case 'e':
    case 'Z':
      return C_Other;
    default:
      break;
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}

/// Examine constraint type and operand type and determine a weight value.
/// This object must already have been set up with the operand type
/// and the current alternative constraint selected.
TargetLowering::ConstraintWeight
X86TargetLowering::getSingleConstraintMatchWeight(
    AsmOperandInfo &info, const char *constraint) const {
  ConstraintWeight weight = CW_Invalid;
  Value *CallOperandVal = info.CallOperandVal;
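  // Illustrative examples of the weights computed below: an i32 operand tied
  // to "a" (EAX) scores CW_SpecificReg, an immediate in range for 'I' (0..31)
  // scores CW_Constant, and unmatched cases fall back to the TargetLowering
  // default.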
  // If we don't have a value, we can't do a match,
  // but allow it at the lowest weight.
  if (!CallOperandVal)
    return CW_Default;
  Type *type = CallOperandVal->getType();
  // Look at the constraint type.
  switch (*constraint) {
  default:
    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
  case 'R':
  case 'q':
  case 'Q':
  case 'a':
  case 'b':
  case 'c':
  case 'd':
  case 'S':
  case 'D':
  case 'A':
    if (CallOperandVal->getType()->isIntegerTy())
      weight = CW_SpecificReg;
    break;
  case 'f':
  case 't':
  case 'u':
    if (type->isFloatingPointTy())
      weight = CW_SpecificReg;
    break;
  case 'y':
    if (type->isX86_MMXTy() && Subtarget->hasMMX())
      weight = CW_SpecificReg;
    break;
  case 'x':
  case 'Y':
    if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) ||
        ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasFp256()))
      weight = CW_Register;
    break;
  case 'I':
    if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
      if (C->getZExtValue() <= 31)
        weight = CW_Constant;
    }
    break;
  case 'J':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 63)
        weight = CW_Constant;
    }
    break;
  case 'K':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
        weight = CW_Constant;
    }
    break;
  case 'L':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
        weight = CW_Constant;
    }
    break;
  case 'M':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 3)
        weight = CW_Constant;
    }
    break;
  case 'N':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 0xff)
        weight = CW_Constant;
    }
    break;
  case 'G':
  case 'C':
    if (dyn_cast<ConstantFP>(CallOperandVal)) {
      weight = CW_Constant;
    }
    break;
  case 'e':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getSExtValue() >= -0x80000000LL) &&
          (C->getSExtValue() <= 0x7fffffffLL))
        weight = CW_Constant;
    }
    break;
  case 'Z':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 0xffffffff)
        weight = CW_Constant;
    }
    break;
  }
  return weight;
}

/// LowerXConstraint - try to replace an X constraint, which matches anything,
/// with another that has more specific requirements based on the type of the
/// corresponding operand.
const char *X86TargetLowering::
LowerXConstraint(EVT ConstraintVT) const {
  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
  // 'f' like normal targets.
  if (ConstraintVT.isFloatingPoint()) {
    if (Subtarget->hasSSE2())
      return "Y";
    if (Subtarget->hasSSE1())
      return "x";
  }

  return TargetLowering::LowerXConstraint(ConstraintVT);
}

/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector. If it is invalid, don't add anything to Ops.
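/// For example (illustrative), constraint 'K' only accepts signed 8-bit
/// immediates: an i32 operand of 200 is rejected (nothing is appended to Ops),
/// while an operand of 100 is added as a target constant.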
void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                     std::string &Constraint,
                                                     std::vector<SDValue>&Ops,
                                                     SelectionDAG &DAG) const {
  SDValue Result;

  // Only support length 1 constraints for now.
  if (Constraint.length() > 1) return;

  char ConstraintLetter = Constraint[0];
  switch (ConstraintLetter) {
  default: break;
  case 'I':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 31) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'J':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 63) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'K':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (isInt<8>(C->getSExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'N':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 255) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'e': {
    // 32-bit signed value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                           C->getSExtValue())) {
        // Widen to 64 bits here to get it sign extended.
        Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
        break;
      }
      // FIXME gcc accepts some relocatable values here too, but only in certain
      // memory models; it's complicated.
    }
    return;
  }
  case 'Z': {
    // 32-bit unsigned value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                           C->getZExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    // FIXME gcc accepts some relocatable values here too, but only in certain
    // memory models; it's complicated.
    return;
  }
  case 'i': {
    // Literal immediates are always ok.
    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
      // Widen to 64 bits here to get it sign extended.
      Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
      break;
    }

    // In any sort of PIC mode addresses need to be computed at runtime by
    // adding in a register or some sort of table lookup. These can't
    // be used as immediates.
    if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC())
      return;

    // If we are in non-pic codegen mode, we allow the address of a global (with
    // an optional displacement) to be used with 'i'.
    GlobalAddressSDNode *GA = nullptr;
    int64_t Offset = 0;

    // Match either (GA), (GA+C), (GA+C1+C2), etc.
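    // For instance (illustrative), an operand of the form
    //   (add (add (GlobalAddress @g), 4), 8)
    // is peeled by the loop below, leaving GA = @g and Offset = 12.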
    while (1) {
      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
        Offset += GA->getOffset();
        break;
      } else if (Op.getOpcode() == ISD::ADD) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      } else if (Op.getOpcode() == ISD::SUB) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += -C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      }

      // Otherwise, this isn't something we can handle, reject it.
      return;
    }

    const GlobalValue *GV = GA->getGlobal();
    // If we require an extra load to get this address, as in PIC mode, we
    // can't accept it.
    if (isGlobalStubReference(
            Subtarget->ClassifyGlobalReference(GV, DAG.getTarget())))
      return;

    Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
                                        GA->getValueType(0), Offset);
    break;
  }
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }
  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}

std::pair<unsigned, const TargetRegisterClass*>
X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
                                                MVT VT) const {
  // First, see if this is a constraint that directly corresponds to an LLVM
  // register class.
  if (Constraint.size() == 1) {
    // GCC Constraint Letters
    switch (Constraint[0]) {
    default: break;
    // TODO: Slight differences here in allocation order and leaving
    // RIP in the class. Do they matter any more here than they do
    // in the normal allocation?
    case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
      if (Subtarget->is64Bit()) {
        if (VT == MVT::i32 || VT == MVT::f32)
          return std::make_pair(0U, &X86::GR32RegClass);
        if (VT == MVT::i16)
          return std::make_pair(0U, &X86::GR16RegClass);
        if (VT == MVT::i8 || VT == MVT::i1)
          return std::make_pair(0U, &X86::GR8RegClass);
        if (VT == MVT::i64 || VT == MVT::f64)
          return std::make_pair(0U, &X86::GR64RegClass);
        break;
      }
      // 32-bit fallthrough
    case 'Q':   // Q_REGS
      if (VT == MVT::i32 || VT == MVT::f32)
        return std::make_pair(0U, &X86::GR32_ABCDRegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16_ABCDRegClass);
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
      if (VT == MVT::i64)
        return std::make_pair(0U, &X86::GR64_ABCDRegClass);
      break;
    case 'r':   // GENERAL_REGS
    case 'l':   // INDEX_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8RegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16RegClass);
      if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
        return std::make_pair(0U, &X86::GR32RegClass);
      return std::make_pair(0U, &X86::GR64RegClass);
    case 'R':   // LEGACY_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8_NOREXRegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16_NOREXRegClass);
      if (VT == MVT::i32 || !Subtarget->is64Bit())
        return std::make_pair(0U, &X86::GR32_NOREXRegClass);
      return std::make_pair(0U, &X86::GR64_NOREXRegClass);
    case 'f':   // FP Stack registers.
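      // (Illustrative: in GCC inline asm, 'f' accepts any x87 stack register,
      // while 't' and 'u' name st(0) and st(1) specifically.)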
      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
      // value to the correct fpstack register class.
      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP32RegClass);
      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP64RegClass);
      return std::make_pair(0U, &X86::RFP80RegClass);
    case 'y':   // MMX_REGS if MMX allowed.
      if (!Subtarget->hasMMX()) break;
      return std::make_pair(0U, &X86::VR64RegClass);
    case 'Y':   // SSE_REGS if SSE2 allowed
      if (!Subtarget->hasSSE2()) break;
      // FALL THROUGH.
    case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
      if (!Subtarget->hasSSE1()) break;

      switch (VT.SimpleTy) {
      default: break;
      // Scalar SSE types.
      case MVT::f32:
      case MVT::i32:
        return std::make_pair(0U, &X86::FR32RegClass);
      case MVT::f64:
      case MVT::i64:
        return std::make_pair(0U, &X86::FR64RegClass);
      // Vector types.
      case MVT::v16i8:
      case MVT::v8i16:
      case MVT::v4i32:
      case MVT::v2i64:
      case MVT::v4f32:
      case MVT::v2f64:
        return std::make_pair(0U, &X86::VR128RegClass);
      // AVX types.
      case MVT::v32i8:
      case MVT::v16i16:
      case MVT::v8i32:
      case MVT::v4i64:
      case MVT::v8f32:
      case MVT::v4f64:
        return std::make_pair(0U, &X86::VR256RegClass);
      case MVT::v8f64:
      case MVT::v16f32:
      case MVT::v16i32:
      case MVT::v8i64:
        return std::make_pair(0U, &X86::VR512RegClass);
      }
      break;
    }
  }

  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass*> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);

  // Not found as a standard register?
  if (!Res.second) {
    // Map st(0) -> st(7) -> ST0
    if (Constraint.size() == 7 && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 's' &&
        tolower(Constraint[2]) == 't' &&
        Constraint[3] == '(' &&
        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
        Constraint[5] == ')' &&
        Constraint[6] == '}') {

      Res.first = X86::FP0+Constraint[4]-'0';
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // GCC allows "st(0)" to be called just plain "st".
    if (StringRef("{st}").equals_lower(Constraint)) {
      Res.first = X86::FP0;
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // flags -> EFLAGS
    if (StringRef("{flags}").equals_lower(Constraint)) {
      Res.first = X86::EFLAGS;
      Res.second = &X86::CCRRegClass;
      return Res;
    }

    // 'A' means EAX + EDX.
    if (Constraint == "A") {
      Res.first = X86::EAX;
      Res.second = &X86::GR32_ADRegClass;
      return Res;
    }
    return Res;
  }

  // Otherwise, check to see if this is a register class of the wrong value
  // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
  // turn into {ax},{dx}.
  if (Res.second->hasType(VT))
    return Res;   // Correct type already, nothing to do.

  // All of the single-register GCC register classes map their values onto
  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp". If we
  // really want an 8-bit or 32-bit register, map to the appropriate register
  // class and return the appropriate register.
  if (Res.second == &X86::GR16RegClass) {
    if (VT == MVT::i8 || VT == MVT::i1) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::AL; break;
      case X86::DX: DestReg = X86::DL; break;
      case X86::CX: DestReg = X86::CL; break;
      case X86::BX: DestReg = X86::BL; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = &X86::GR8RegClass;
      }
    } else if (VT == MVT::i32 || VT == MVT::f32) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::EAX; break;
      case X86::DX: DestReg = X86::EDX; break;
      case X86::CX: DestReg = X86::ECX; break;
      case X86::BX: DestReg = X86::EBX; break;
      case X86::SI: DestReg = X86::ESI; break;
      case X86::DI: DestReg = X86::EDI; break;
      case X86::BP: DestReg = X86::EBP; break;
      case X86::SP: DestReg = X86::ESP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = &X86::GR32RegClass;
      }
    } else if (VT == MVT::i64 || VT == MVT::f64) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::RAX; break;
      case X86::DX: DestReg = X86::RDX; break;
      case X86::CX: DestReg = X86::RCX; break;
      case X86::BX: DestReg = X86::RBX; break;
      case X86::SI: DestReg = X86::RSI; break;
      case X86::DI: DestReg = X86::RDI; break;
      case X86::BP: DestReg = X86::RBP; break;
      case X86::SP: DestReg = X86::RSP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = &X86::GR64RegClass;
      }
    }
  } else if (Res.second == &X86::FR32RegClass ||
             Res.second == &X86::FR64RegClass ||
             Res.second == &X86::VR128RegClass ||
             Res.second == &X86::VR256RegClass ||
             Res.second == &X86::FR32XRegClass ||
             Res.second == &X86::FR64XRegClass ||
             Res.second == &X86::VR128XRegClass ||
             Res.second == &X86::VR256XRegClass ||
             Res.second == &X86::VR512RegClass) {
    // Handle references to XMM physical registers that got mapped into the
    // wrong class. This can happen with constraints like {xmm0} where the
    // target independent register mapper will just pick the first match it can
    // find, ignoring the required type.

    if (VT == MVT::f32 || VT == MVT::i32)
      Res.second = &X86::FR32RegClass;
    else if (VT == MVT::f64 || VT == MVT::i64)
      Res.second = &X86::FR64RegClass;
    else if (X86::VR128RegClass.hasType(VT))
      Res.second = &X86::VR128RegClass;
    else if (X86::VR256RegClass.hasType(VT))
      Res.second = &X86::VR256RegClass;
    else if (X86::VR512RegClass.hasType(VT))
      Res.second = &X86::VR512RegClass;
  }

  return Res;
}

int X86TargetLowering::getScalingFactorCost(const AddrMode &AM,
                                            Type *Ty) const {
  // Scaling factors are not free at all.
  // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
  // will take 2 allocations in the out of order engine instead of 1
  // for plain addressing mode, i.e. inst (reg1).
  // E.g.,
  // vaddps (%rsi,%rdx), %ymm0, %ymm1
  // Requires two allocations (one for the load, one for the computation)
  // whereas:
  // vaddps (%rsi), %ymm0, %ymm1
  // Requires just 1 allocation, i.e., freeing allocations for other operations
  // and having fewer micro operations to execute.
  //
  // For some X86 architectures, this is even worse because for instance for
  // stores, the complex addressing mode forces the instruction to use the
  // "load" ports instead of the dedicated "store" port.
  // E.g., on Haswell:
  // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
  // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
  if (isLegalAddressingMode(AM, Ty))
    // Scale represents reg2 * scale, thus account for 1
    // as soon as we use a second register.
    return AM.Scale != 0;
  return -1;
}

bool X86TargetLowering::isTargetFTOL() const {
  return Subtarget->isTargetKnownWindowsMSVC() && !Subtarget->is64Bit();
}
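// Worked example for getScalingFactorCost above (illustrative): a legal mode
// with Scale == 0, e.g. "(%rsi)", costs 0; a legal mode with Scale != 0, e.g.
// "(%rsi,%rdx,2)", costs 1 for the extra register allocation; an addressing
// mode that is not legal for the type yields -1.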