LLVM API Documentation

AArch64ISelLowering.cpp
00001 //===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation  ----===//
00002 //
00003 //                     The LLVM Compiler Infrastructure
00004 //
00005 // This file is distributed under the University of Illinois Open Source
00006 // License. See LICENSE.TXT for details.
00007 //
00008 //===----------------------------------------------------------------------===//
00009 //
00010 // This file implements the AArch64TargetLowering class.
00011 //
00012 //===----------------------------------------------------------------------===//
00013 
00014 #include "AArch64ISelLowering.h"
00015 #include "AArch64MachineFunctionInfo.h"
00016 #include "AArch64PerfectShuffle.h"
00017 #include "AArch64Subtarget.h"
00018 #include "AArch64TargetMachine.h"
00019 #include "AArch64TargetObjectFile.h"
00020 #include "MCTargetDesc/AArch64AddressingModes.h"
00021 #include "llvm/ADT/Statistic.h"
00022 #include "llvm/CodeGen/CallingConvLower.h"
00023 #include "llvm/CodeGen/MachineFrameInfo.h"
00024 #include "llvm/CodeGen/MachineInstrBuilder.h"
00025 #include "llvm/CodeGen/MachineRegisterInfo.h"
00026 #include "llvm/IR/Function.h"
00027 #include "llvm/IR/Intrinsics.h"
00028 #include "llvm/IR/Type.h"
00029 #include "llvm/Support/CommandLine.h"
00030 #include "llvm/Support/Debug.h"
00031 #include "llvm/Support/ErrorHandling.h"
00032 #include "llvm/Support/raw_ostream.h"
00033 #include "llvm/Target/TargetOptions.h"
00034 using namespace llvm;
00035 
00036 #define DEBUG_TYPE "aarch64-lower"
00037 
00038 STATISTIC(NumTailCalls, "Number of tail calls");
00039 STATISTIC(NumShiftInserts, "Number of vector shift inserts");
00040 
00041 namespace {
00042 enum AlignMode {
00043   StrictAlign,
00044   NoStrictAlign
00045 };
00046 }
00047 
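      // Note: because this cl::opt is given no option name, the enumerator
      // names below act as the flags themselves, e.g. "llc -aarch64-strict-align"
      // selects StrictAlign.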
00048 static cl::opt<AlignMode>
00049 Align(cl::desc("Load/store alignment support"),
00050       cl::Hidden, cl::init(NoStrictAlign),
00051       cl::values(
00052           clEnumValN(StrictAlign,   "aarch64-strict-align",
00053                      "Disallow all unaligned memory accesses"),
00054           clEnumValN(NoStrictAlign, "aarch64-no-strict-align",
00055                      "Allow unaligned memory accesses"),
00056           clEnumValEnd));
00057 
00058 // Placeholder until EXTR generation is fully tested.
00059 static cl::opt<bool>
00060 EnableAArch64ExtrGeneration("aarch64-extr-generation", cl::Hidden,
00061                           cl::desc("Allow AArch64 (or (shift)(shift))->extract"),
00062                           cl::init(true));
00063 
00064 static cl::opt<bool>
00065 EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
00066                          cl::desc("Allow AArch64 SLI/SRI formation"),
00067                          cl::init(false));
00068 
00069 //===----------------------------------------------------------------------===//
00070 // AArch64 Lowering public interface.
00071 //===----------------------------------------------------------------------===//
00072 static TargetLoweringObjectFile *createTLOF(const Triple &TT) {
00073   if (TT.isOSBinFormatMachO())
00074     return new AArch64_MachoTargetObjectFile();
00075 
00076   return new AArch64_ELFTargetObjectFile();
00077 }
00078 
00079 AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM)
00080     : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) {
00081   Subtarget = &TM.getSubtarget<AArch64Subtarget>();
00082 
00083   // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
00084   // we have to make something up. Arbitrarily, choose ZeroOrOne.
00085   setBooleanContents(ZeroOrOneBooleanContent);
00086   // When comparing vectors the result sets the different elements in the
00087   // vector to all-one or all-zero.
00088   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
00089 
00090   // Set up the register classes.
00091   addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
00092   addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
00093 
00094   if (Subtarget->hasFPARMv8()) {
00095     addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
00096     addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
00097     addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
00098     addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
00099   }
00100 
00101   if (Subtarget->hasNEON()) {
00102     addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
00103     addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
00104     // Someone set us up the NEON.
00105     addDRTypeForNEON(MVT::v2f32);
00106     addDRTypeForNEON(MVT::v8i8);
00107     addDRTypeForNEON(MVT::v4i16);
00108     addDRTypeForNEON(MVT::v2i32);
00109     addDRTypeForNEON(MVT::v1i64);
00110     addDRTypeForNEON(MVT::v1f64);
00111     addDRTypeForNEON(MVT::v4f16);
00112 
00113     addQRTypeForNEON(MVT::v4f32);
00114     addQRTypeForNEON(MVT::v2f64);
00115     addQRTypeForNEON(MVT::v16i8);
00116     addQRTypeForNEON(MVT::v8i16);
00117     addQRTypeForNEON(MVT::v4i32);
00118     addQRTypeForNEON(MVT::v2i64);
00119     addQRTypeForNEON(MVT::v8f16);
00120   }
00121 
00122   // Compute derived properties from the register classes
00123   computeRegisterProperties();
00124 
00125   // Provide all sorts of operation actions
00126   setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
00127   setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
00128   setOperationAction(ISD::SETCC, MVT::i32, Custom);
00129   setOperationAction(ISD::SETCC, MVT::i64, Custom);
00130   setOperationAction(ISD::SETCC, MVT::f32, Custom);
00131   setOperationAction(ISD::SETCC, MVT::f64, Custom);
00132   setOperationAction(ISD::BRCOND, MVT::Other, Expand);
00133   setOperationAction(ISD::BR_CC, MVT::i32, Custom);
00134   setOperationAction(ISD::BR_CC, MVT::i64, Custom);
00135   setOperationAction(ISD::BR_CC, MVT::f32, Custom);
00136   setOperationAction(ISD::BR_CC, MVT::f64, Custom);
00137   setOperationAction(ISD::SELECT, MVT::i32, Custom);
00138   setOperationAction(ISD::SELECT, MVT::i64, Custom);
00139   setOperationAction(ISD::SELECT, MVT::f32, Custom);
00140   setOperationAction(ISD::SELECT, MVT::f64, Custom);
00141   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
00142   setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
00143   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
00144   setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
00145   setOperationAction(ISD::BR_JT, MVT::Other, Expand);
00146   setOperationAction(ISD::JumpTable, MVT::i64, Custom);
00147 
00148   setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
00149   setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
00150   setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
00151 
00152   setOperationAction(ISD::FREM, MVT::f32, Expand);
00153   setOperationAction(ISD::FREM, MVT::f64, Expand);
00154   setOperationAction(ISD::FREM, MVT::f80, Expand);
00155 
00156   // Custom lowering hooks are needed for XOR
00157   // to fold it into CSINC/CSINV.
00158   setOperationAction(ISD::XOR, MVT::i32, Custom);
00159   setOperationAction(ISD::XOR, MVT::i64, Custom);
00160 
00161   // Virtually no operations on f128 are legal, but LLVM can't expand them when
00162   // there's a valid register class, so we need custom operations in most cases.
00163   setOperationAction(ISD::FABS, MVT::f128, Expand);
00164   setOperationAction(ISD::FADD, MVT::f128, Custom);
00165   setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
00166   setOperationAction(ISD::FCOS, MVT::f128, Expand);
00167   setOperationAction(ISD::FDIV, MVT::f128, Custom);
00168   setOperationAction(ISD::FMA, MVT::f128, Expand);
00169   setOperationAction(ISD::FMUL, MVT::f128, Custom);
00170   setOperationAction(ISD::FNEG, MVT::f128, Expand);
00171   setOperationAction(ISD::FPOW, MVT::f128, Expand);
00172   setOperationAction(ISD::FREM, MVT::f128, Expand);
00173   setOperationAction(ISD::FRINT, MVT::f128, Expand);
00174   setOperationAction(ISD::FSIN, MVT::f128, Expand);
00175   setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
00176   setOperationAction(ISD::FSQRT, MVT::f128, Expand);
00177   setOperationAction(ISD::FSUB, MVT::f128, Custom);
00178   setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
00179   setOperationAction(ISD::SETCC, MVT::f128, Custom);
00180   setOperationAction(ISD::BR_CC, MVT::f128, Custom);
00181   setOperationAction(ISD::SELECT, MVT::f128, Custom);
00182   setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
00183   setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
00184 
00185   // Lowering for many of the conversions is actually specified by the non-f128
00186   // type. The LowerXXX function will be trivial when f128 isn't involved.
00187   setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
00188   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
00189   setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
00190   setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
00191   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
00192   setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
00193   setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
00194   setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
00195   setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
00196   setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
00197   setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
00198   setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
00199   setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
00200   setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
00201 
00202   // Variable arguments.
00203   setOperationAction(ISD::VASTART, MVT::Other, Custom);
00204   setOperationAction(ISD::VAARG, MVT::Other, Custom);
00205   setOperationAction(ISD::VACOPY, MVT::Other, Custom);
00206   setOperationAction(ISD::VAEND, MVT::Other, Expand);
00207 
00208   // Variable-sized objects.
00209   setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
00210   setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
00211   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
00212 
00213   // Exception handling.
00214   // FIXME: These are guesses. Has this been defined yet?
00215   setExceptionPointerRegister(AArch64::X0);
00216   setExceptionSelectorRegister(AArch64::X1);
00217 
00218   // Constant pool entries
00219   setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
00220 
00221   // BlockAddress
00222   setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
00223 
00224   // Add/Sub overflow ops with MVT::Glue are lowered to NZCV dependences.
00225   setOperationAction(ISD::ADDC, MVT::i32, Custom);
00226   setOperationAction(ISD::ADDE, MVT::i32, Custom);
00227   setOperationAction(ISD::SUBC, MVT::i32, Custom);
00228   setOperationAction(ISD::SUBE, MVT::i32, Custom);
00229   setOperationAction(ISD::ADDC, MVT::i64, Custom);
00230   setOperationAction(ISD::ADDE, MVT::i64, Custom);
00231   setOperationAction(ISD::SUBC, MVT::i64, Custom);
00232   setOperationAction(ISD::SUBE, MVT::i64, Custom);
00233 
00234   // AArch64 lacks both left-rotate and popcount instructions.
00235   setOperationAction(ISD::ROTL, MVT::i32, Expand);
00236   setOperationAction(ISD::ROTL, MVT::i64, Expand);
00237 
00238   // AArch64 doesn't have {U|S}MUL_LOHI.
00239   setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
00240   setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
00241 
00242 
00243   // Expand the undefined-at-zero variants of cttz/ctlz to their defined-at-zero
00244   // counterparts, which AArch64 supports directly.
00245   setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
00246   setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
00247   setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
00248   setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
00249 
00250   setOperationAction(ISD::CTPOP, MVT::i32, Custom);
00251   setOperationAction(ISD::CTPOP, MVT::i64, Custom);
00252 
00253   setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
00254   setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
00255   setOperationAction(ISD::SREM, MVT::i32, Expand);
00256   setOperationAction(ISD::SREM, MVT::i64, Expand);
00257   setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
00258   setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
00259   setOperationAction(ISD::UREM, MVT::i32, Expand);
00260   setOperationAction(ISD::UREM, MVT::i64, Expand);
00261 
00262   // Custom lower Add/Sub/Mul with overflow.
00263   setOperationAction(ISD::SADDO, MVT::i32, Custom);
00264   setOperationAction(ISD::SADDO, MVT::i64, Custom);
00265   setOperationAction(ISD::UADDO, MVT::i32, Custom);
00266   setOperationAction(ISD::UADDO, MVT::i64, Custom);
00267   setOperationAction(ISD::SSUBO, MVT::i32, Custom);
00268   setOperationAction(ISD::SSUBO, MVT::i64, Custom);
00269   setOperationAction(ISD::USUBO, MVT::i32, Custom);
00270   setOperationAction(ISD::USUBO, MVT::i64, Custom);
00271   setOperationAction(ISD::SMULO, MVT::i32, Custom);
00272   setOperationAction(ISD::SMULO, MVT::i64, Custom);
00273   setOperationAction(ISD::UMULO, MVT::i32, Custom);
00274   setOperationAction(ISD::UMULO, MVT::i64, Custom);
00275 
00276   setOperationAction(ISD::FSIN, MVT::f32, Expand);
00277   setOperationAction(ISD::FSIN, MVT::f64, Expand);
00278   setOperationAction(ISD::FCOS, MVT::f32, Expand);
00279   setOperationAction(ISD::FCOS, MVT::f64, Expand);
00280   setOperationAction(ISD::FPOW, MVT::f32, Expand);
00281   setOperationAction(ISD::FPOW, MVT::f64, Expand);
00282   setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
00283   setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
00284 
00285   // f16 is storage-only, so we promote operations to f32 if we know this is
00286   // valid, and ignore them otherwise. The operations not mentioned here will
00287   // fail to select, but this is not a major problem as no source language
00288   // should be emitting native f16 operations yet.
00289   setOperationAction(ISD::FADD, MVT::f16, Promote);
00290   setOperationAction(ISD::FDIV, MVT::f16, Promote);
00291   setOperationAction(ISD::FMUL, MVT::f16, Promote);
00292   setOperationAction(ISD::FSUB, MVT::f16, Promote);
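        // With Promote, a scalar f16 operation such as (fadd f16 %a, %b) is
        // legalized as (fptrunc (fadd (fpext %a), (fpext %b)) to f16) using f32.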
00293 
00294   // v4f16 is also a storage-only type, so promote it to v4f32 when that is
00295   // known to be safe.
00296   setOperationAction(ISD::FADD, MVT::v4f16, Promote);
00297   setOperationAction(ISD::FSUB, MVT::v4f16, Promote);
00298   setOperationAction(ISD::FMUL, MVT::v4f16, Promote);
00299   setOperationAction(ISD::FDIV, MVT::v4f16, Promote);
00300   setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Promote);
00301   setOperationAction(ISD::FP_ROUND, MVT::v4f16, Promote);
00302   AddPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32);
00303   AddPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32);
00304   AddPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32);
00305   AddPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32);
00306   AddPromotedToType(ISD::FP_EXTEND, MVT::v4f16, MVT::v4f32);
00307   AddPromotedToType(ISD::FP_ROUND, MVT::v4f16, MVT::v4f32);
00308 
00309   // Expand all other v4f16 operations.
00310   // FIXME: We could generate better code by promoting some operations to
00311   // a pair of v4f32s
00312   setOperationAction(ISD::FABS, MVT::v4f16, Expand);
00313   setOperationAction(ISD::FCEIL, MVT::v4f16, Expand);
00314   setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand);
00315   setOperationAction(ISD::FCOS, MVT::v4f16, Expand);
00316   setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand);
00317   setOperationAction(ISD::FMA, MVT::v4f16, Expand);
00318   setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand);
00319   setOperationAction(ISD::FNEG, MVT::v4f16, Expand);
00320   setOperationAction(ISD::FPOW, MVT::v4f16, Expand);
00321   setOperationAction(ISD::FPOWI, MVT::v4f16, Expand);
00322   setOperationAction(ISD::FREM, MVT::v4f16, Expand);
00323   setOperationAction(ISD::FROUND, MVT::v4f16, Expand);
00324   setOperationAction(ISD::FRINT, MVT::v4f16, Expand);
00325   setOperationAction(ISD::FSIN, MVT::v4f16, Expand);
00326   setOperationAction(ISD::FSINCOS, MVT::v4f16, Expand);
00327   setOperationAction(ISD::FSQRT, MVT::v4f16, Expand);
00328   setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand);
00329   setOperationAction(ISD::SETCC, MVT::v4f16, Expand);
00330   setOperationAction(ISD::BR_CC, MVT::v4f16, Expand);
00331   setOperationAction(ISD::SELECT, MVT::v4f16, Expand);
00332   setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand);
00333   setOperationAction(ISD::FEXP, MVT::v4f16, Expand);
00334   setOperationAction(ISD::FEXP2, MVT::v4f16, Expand);
00335   setOperationAction(ISD::FLOG, MVT::v4f16, Expand);
00336   setOperationAction(ISD::FLOG2, MVT::v4f16, Expand);
00337   setOperationAction(ISD::FLOG10, MVT::v4f16, Expand);
00338 
00339 
00340   // v8f16 is also a storage-only type, so expand it.
00341   setOperationAction(ISD::FABS, MVT::v8f16, Expand);
00342   setOperationAction(ISD::FADD, MVT::v8f16, Expand);
00343   setOperationAction(ISD::FCEIL, MVT::v8f16, Expand);
00344   setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand);
00345   setOperationAction(ISD::FCOS, MVT::v8f16, Expand);
00346   setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
00347   setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand);
00348   setOperationAction(ISD::FMA, MVT::v8f16, Expand);
00349   setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
00350   setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand);
00351   setOperationAction(ISD::FNEG, MVT::v8f16, Expand);
00352   setOperationAction(ISD::FPOW, MVT::v8f16, Expand);
00353   setOperationAction(ISD::FPOWI, MVT::v8f16, Expand);
00354   setOperationAction(ISD::FREM, MVT::v8f16, Expand);
00355   setOperationAction(ISD::FROUND, MVT::v8f16, Expand);
00356   setOperationAction(ISD::FRINT, MVT::v8f16, Expand);
00357   setOperationAction(ISD::FSIN, MVT::v8f16, Expand);
00358   setOperationAction(ISD::FSINCOS, MVT::v8f16, Expand);
00359   setOperationAction(ISD::FSQRT, MVT::v8f16, Expand);
00360   setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
00361   setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand);
00362   setOperationAction(ISD::SETCC, MVT::v8f16, Expand);
00363   setOperationAction(ISD::BR_CC, MVT::v8f16, Expand);
00364   setOperationAction(ISD::SELECT, MVT::v8f16, Expand);
00365   setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand);
00366   setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand);
00367   setOperationAction(ISD::FEXP, MVT::v8f16, Expand);
00368   setOperationAction(ISD::FEXP2, MVT::v8f16, Expand);
00369   setOperationAction(ISD::FLOG, MVT::v8f16, Expand);
00370   setOperationAction(ISD::FLOG2, MVT::v8f16, Expand);
00371   setOperationAction(ISD::FLOG10, MVT::v8f16, Expand);
00372 
00373   // AArch64 has implementations of a lot of rounding-like FP operations.
00374   static MVT RoundingTypes[] = { MVT::f32, MVT::f64};
00375   for (unsigned I = 0; I < array_lengthof(RoundingTypes); ++I) {
00376     MVT Ty = RoundingTypes[I];
00377     setOperationAction(ISD::FFLOOR, Ty, Legal);
00378     setOperationAction(ISD::FNEARBYINT, Ty, Legal);
00379     setOperationAction(ISD::FCEIL, Ty, Legal);
00380     setOperationAction(ISD::FRINT, Ty, Legal);
00381     setOperationAction(ISD::FTRUNC, Ty, Legal);
00382     setOperationAction(ISD::FROUND, Ty, Legal);
00383   }
00384 
00385   setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
00386 
00387   if (Subtarget->isTargetMachO()) {
00388     // For iOS, we don't want the normal expansion of a libcall to
00389     // sincos. We want to issue a libcall to __sincos_stret to avoid memory
00390     // traffic.
00391     setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
00392     setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
00393   } else {
00394     setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
00395     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
00396   }
00397 
00398   // AArch64 does not have floating-point extending loads, i1 sign-extending
00399   // loads, floating-point truncating stores, or a v2i32->v2i16 truncating store.
00400   setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
00401   setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
00402   setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand);
00403   setLoadExtAction(ISD::EXTLOAD, MVT::f80, Expand);
00404   setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Expand);
00405   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
00406   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
00407   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
00408   setTruncStoreAction(MVT::f128, MVT::f80, Expand);
00409   setTruncStoreAction(MVT::f128, MVT::f64, Expand);
00410   setTruncStoreAction(MVT::f128, MVT::f32, Expand);
00411   setTruncStoreAction(MVT::f128, MVT::f16, Expand);
00412 
00413   setOperationAction(ISD::BITCAST, MVT::i16, Custom);
00414   setOperationAction(ISD::BITCAST, MVT::f16, Custom);
00415 
00416   // Indexed loads and stores are supported.
00417   for (unsigned im = (unsigned)ISD::PRE_INC;
00418        im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
00419     setIndexedLoadAction(im, MVT::i8, Legal);
00420     setIndexedLoadAction(im, MVT::i16, Legal);
00421     setIndexedLoadAction(im, MVT::i32, Legal);
00422     setIndexedLoadAction(im, MVT::i64, Legal);
00423     setIndexedLoadAction(im, MVT::f64, Legal);
00424     setIndexedLoadAction(im, MVT::f32, Legal);
00425     setIndexedStoreAction(im, MVT::i8, Legal);
00426     setIndexedStoreAction(im, MVT::i16, Legal);
00427     setIndexedStoreAction(im, MVT::i32, Legal);
00428     setIndexedStoreAction(im, MVT::i64, Legal);
00429     setIndexedStoreAction(im, MVT::f64, Legal);
00430     setIndexedStoreAction(im, MVT::f32, Legal);
00431   }
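        // Pre/post-indexed forms update the base register in the same
        // instruction, e.g. "ldr x0, [x1], #8" loads from x1 and then adds 8 to x1.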
00432 
00433   // Trap.
00434   setOperationAction(ISD::TRAP, MVT::Other, Legal);
00435 
00436   // We combine OR nodes for bitfield operations.
00437   setTargetDAGCombine(ISD::OR);
00438 
00439   // Vector add and sub nodes may conceal a high-half opportunity.
00440   // Also, try to fold ADD into CSINC/CSINV.
00441   setTargetDAGCombine(ISD::ADD);
00442   setTargetDAGCombine(ISD::SUB);
00443 
00444   setTargetDAGCombine(ISD::XOR);
00445   setTargetDAGCombine(ISD::SINT_TO_FP);
00446   setTargetDAGCombine(ISD::UINT_TO_FP);
00447 
00448   setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
00449 
00450   setTargetDAGCombine(ISD::ANY_EXTEND);
00451   setTargetDAGCombine(ISD::ZERO_EXTEND);
00452   setTargetDAGCombine(ISD::SIGN_EXTEND);
00453   setTargetDAGCombine(ISD::BITCAST);
00454   setTargetDAGCombine(ISD::CONCAT_VECTORS);
00455   setTargetDAGCombine(ISD::STORE);
00456 
00457   setTargetDAGCombine(ISD::MUL);
00458 
00459   setTargetDAGCombine(ISD::SELECT);
00460   setTargetDAGCombine(ISD::VSELECT);
00461 
00462   setTargetDAGCombine(ISD::INTRINSIC_VOID);
00463   setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
00464   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
00465 
00466   MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8;
00467   MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
00468   MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4;
00469 
00470   setStackPointerRegisterToSaveRestore(AArch64::SP);
00471 
00472   setSchedulingPreference(Sched::Hybrid);
00473 
00474   // Enable TBZ/TBNZ
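        // (TBZ/TBNZ test a single bit and branch, e.g. "tbz x0, #3, label"
        // branches if bit 3 of x0 is clear.)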
00475   MaskAndBranchFoldingIsLegal = true;
00476 
00477   setMinFunctionAlignment(2);
00478 
00479   RequireStrictAlign = (Align == StrictAlign);
00480 
00481   setHasExtractBitsInsn(true);
00482 
00483   if (Subtarget->hasNEON()) {
00484     // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
00485     // silliness like this:
00486     setOperationAction(ISD::FABS, MVT::v1f64, Expand);
00487     setOperationAction(ISD::FADD, MVT::v1f64, Expand);
00488     setOperationAction(ISD::FCEIL, MVT::v1f64, Expand);
00489     setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand);
00490     setOperationAction(ISD::FCOS, MVT::v1f64, Expand);
00491     setOperationAction(ISD::FDIV, MVT::v1f64, Expand);
00492     setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand);
00493     setOperationAction(ISD::FMA, MVT::v1f64, Expand);
00494     setOperationAction(ISD::FMUL, MVT::v1f64, Expand);
00495     setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand);
00496     setOperationAction(ISD::FNEG, MVT::v1f64, Expand);
00497     setOperationAction(ISD::FPOW, MVT::v1f64, Expand);
00498     setOperationAction(ISD::FREM, MVT::v1f64, Expand);
00499     setOperationAction(ISD::FROUND, MVT::v1f64, Expand);
00500     setOperationAction(ISD::FRINT, MVT::v1f64, Expand);
00501     setOperationAction(ISD::FSIN, MVT::v1f64, Expand);
00502     setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand);
00503     setOperationAction(ISD::FSQRT, MVT::v1f64, Expand);
00504     setOperationAction(ISD::FSUB, MVT::v1f64, Expand);
00505     setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand);
00506     setOperationAction(ISD::SETCC, MVT::v1f64, Expand);
00507     setOperationAction(ISD::BR_CC, MVT::v1f64, Expand);
00508     setOperationAction(ISD::SELECT, MVT::v1f64, Expand);
00509     setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand);
00510     setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand);
00511 
00512     setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand);
00513     setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand);
00514     setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand);
00515     setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand);
00516     setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand);
00517 
00518     setOperationAction(ISD::MUL, MVT::v1i64, Expand);
00519 
00520     // AArch64 doesn't have a direct vector->f32 conversion instruction for
00521     // elements smaller than i32, so promote the input to i32 first.
00522     setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Promote);
00523     setOperationAction(ISD::SINT_TO_FP, MVT::v4i8, Promote);
00524     setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Promote);
00525     setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Promote);
00526     // Similarly, there is no direct i32 -> f64 vector conversion instruction.
00527     setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
00528     setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
00529     setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom);
00530     setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom);
00531 
00532     // AArch64 doesn't have MUL.2d:
00533     setOperationAction(ISD::MUL, MVT::v2i64, Expand);
00534     setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
00535     setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
00536     // Likewise, narrowing and extending vector loads/stores aren't handled
00537     // directly.
00538     for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
00539          VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
00540 
00541       setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
00542                          Expand);
00543 
00544       setOperationAction(ISD::MULHS, (MVT::SimpleValueType)VT, Expand);
00545       setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
00546       setOperationAction(ISD::MULHU, (MVT::SimpleValueType)VT, Expand);
00547       setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
00548 
00549       setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
00550 
00551       for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
00552            InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
00553         setTruncStoreAction((MVT::SimpleValueType)VT,
00554                             (MVT::SimpleValueType)InnerVT, Expand);
00555       setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
00556       setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
00557       setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
00558     }
00559 
00560     // AArch64 has implementations of a lot of rounding-like FP operations.
00561     static MVT RoundingVecTypes[] = {MVT::v2f32, MVT::v4f32, MVT::v2f64 };
00562     for (unsigned I = 0; I < array_lengthof(RoundingVecTypes); ++I) {
00563       MVT Ty = RoundingVecTypes[I];
00564       setOperationAction(ISD::FFLOOR, Ty, Legal);
00565       setOperationAction(ISD::FNEARBYINT, Ty, Legal);
00566       setOperationAction(ISD::FCEIL, Ty, Legal);
00567       setOperationAction(ISD::FRINT, Ty, Legal);
00568       setOperationAction(ISD::FTRUNC, Ty, Legal);
00569       setOperationAction(ISD::FROUND, Ty, Legal);
00570     }
00571   }
00572 
00573   // Prefer likely predicted branches to selects on out-of-order cores.
00574   if (Subtarget->isCortexA57())
00575     PredictableSelectIsExpensive = true;
00576 }
00577 
00578 void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) {
00579   if (VT == MVT::v2f32 || VT == MVT::v4f16) {
00580     setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
00581     AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i32);
00582 
00583     setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
00584     AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i32);
00585   } else if (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16) {
00586     setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
00587     AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i64);
00588 
00589     setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
00590     AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i64);
00591   }
00592 
00593   // Mark vector float intrinsics as expand.
00594   if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
00595     setOperationAction(ISD::FSIN, VT.getSimpleVT(), Expand);
00596     setOperationAction(ISD::FCOS, VT.getSimpleVT(), Expand);
00597     setOperationAction(ISD::FPOWI, VT.getSimpleVT(), Expand);
00598     setOperationAction(ISD::FPOW, VT.getSimpleVT(), Expand);
00599     setOperationAction(ISD::FLOG, VT.getSimpleVT(), Expand);
00600     setOperationAction(ISD::FLOG2, VT.getSimpleVT(), Expand);
00601     setOperationAction(ISD::FLOG10, VT.getSimpleVT(), Expand);
00602     setOperationAction(ISD::FEXP, VT.getSimpleVT(), Expand);
00603     setOperationAction(ISD::FEXP2, VT.getSimpleVT(), Expand);
00604   }
00605 
00606   setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom);
00607   setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getSimpleVT(), Custom);
00608   setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom);
00609   setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom);
00610   setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Custom);
00611   setOperationAction(ISD::SRA, VT.getSimpleVT(), Custom);
00612   setOperationAction(ISD::SRL, VT.getSimpleVT(), Custom);
00613   setOperationAction(ISD::SHL, VT.getSimpleVT(), Custom);
00614   setOperationAction(ISD::AND, VT.getSimpleVT(), Custom);
00615   setOperationAction(ISD::OR, VT.getSimpleVT(), Custom);
00616   setOperationAction(ISD::SETCC, VT.getSimpleVT(), Custom);
00617   setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal);
00618 
00619   setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand);
00620   setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand);
00621   setOperationAction(ISD::VSELECT, VT.getSimpleVT(), Expand);
00622   setLoadExtAction(ISD::EXTLOAD, VT.getSimpleVT(), Expand);
00623 
00624   // CNT supports only B element sizes.
00625   if (VT != MVT::v8i8 && VT != MVT::v16i8)
00626     setOperationAction(ISD::CTPOP, VT.getSimpleVT(), Expand);
00627 
00628   setOperationAction(ISD::UDIV, VT.getSimpleVT(), Expand);
00629   setOperationAction(ISD::SDIV, VT.getSimpleVT(), Expand);
00630   setOperationAction(ISD::UREM, VT.getSimpleVT(), Expand);
00631   setOperationAction(ISD::SREM, VT.getSimpleVT(), Expand);
00632   setOperationAction(ISD::FREM, VT.getSimpleVT(), Expand);
00633 
00634   setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Custom);
00635   setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Custom);
00636 
00637   if (Subtarget->isLittleEndian()) {
00638     for (unsigned im = (unsigned)ISD::PRE_INC;
00639          im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
00640       setIndexedLoadAction(im, VT.getSimpleVT(), Legal);
00641       setIndexedStoreAction(im, VT.getSimpleVT(), Legal);
00642     }
00643   }
00644 }
00645 
00646 void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
00647   addRegisterClass(VT, &AArch64::FPR64RegClass);
00648   addTypeForNEON(VT, MVT::v2i32);
00649 }
00650 
00651 void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
00652   addRegisterClass(VT, &AArch64::FPR128RegClass);
00653   addTypeForNEON(VT, MVT::v4i32);
00654 }
00655 
00656 EVT AArch64TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
00657   if (!VT.isVector())
00658     return MVT::i32;
00659   return VT.changeVectorElementTypeToInteger();
00660 }
00661 
00662 /// computeKnownBitsForTargetNode - Determine which of the bits specified in
00663 /// Mask are known to be either zero or one and return them in the
00664 /// KnownZero/KnownOne bitsets.
00665 void AArch64TargetLowering::computeKnownBitsForTargetNode(
00666     const SDValue Op, APInt &KnownZero, APInt &KnownOne,
00667     const SelectionDAG &DAG, unsigned Depth) const {
00668   switch (Op.getOpcode()) {
00669   default:
00670     break;
00671   case AArch64ISD::CSEL: {
00672     APInt KnownZero2, KnownOne2;
00673     DAG.computeKnownBits(Op->getOperand(0), KnownZero, KnownOne, Depth + 1);
00674     DAG.computeKnownBits(Op->getOperand(1), KnownZero2, KnownOne2, Depth + 1);
00675     KnownZero &= KnownZero2;
00676     KnownOne &= KnownOne2;
00677     break;
00678   }
00679   case ISD::INTRINSIC_W_CHAIN: {
00680     ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
00681     Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
00682     switch (IntID) {
00683     default: return;
00684     case Intrinsic::aarch64_ldaxr:
00685     case Intrinsic::aarch64_ldxr: {
00686       unsigned BitWidth = KnownOne.getBitWidth();
00687       EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
00688       unsigned MemBits = VT.getScalarType().getSizeInBits();
00689       KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
00690       return;
00691     }
00692     }
00693     break;
00694   }
00695   case ISD::INTRINSIC_WO_CHAIN:
00696   case ISD::INTRINSIC_VOID: {
00697     unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
00698     switch (IntNo) {
00699     default:
00700       break;
00701     case Intrinsic::aarch64_neon_umaxv:
00702     case Intrinsic::aarch64_neon_uminv: {
00703       // Figure out the datatype of the vector operand. The UMINV instruction
00704       // will zero extend the result, so we can mark as known zero all the
00705       // bits larger than the element datatype. 32-bit or larger doesn't need
00706       // this as those are legal types and will be handled by isel directly.
00707       MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
00708       unsigned BitWidth = KnownZero.getBitWidth();
00709       if (VT == MVT::v8i8 || VT == MVT::v16i8) {
00710         assert(BitWidth >= 8 && "Unexpected width!");
00711         APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
00712         KnownZero |= Mask;
00713       } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
00714         assert(BitWidth >= 16 && "Unexpected width!");
00715         APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
00716         KnownZero |= Mask;
00717       }
00718       break;
00719     } break;
00720     }
00721   }
00722   }
00723 }
00724 
00725 MVT AArch64TargetLowering::getScalarShiftAmountTy(EVT LHSTy) const {
00726   return MVT::i64;
00727 }
00728 
00729 unsigned AArch64TargetLowering::getMaximalGlobalOffset() const {
00730   // FIXME: On AArch64, this depends on the type.
00731   // Basically, the addressable offsets are up to 4095 * Ty.getSizeInBytes(),
00732   // and the offset has to be a multiple of the related size in bytes.
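        // For example, a 4-byte (i32) access can fold offsets up to
        // 4095 * 4 = 16380, in multiples of 4.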
00733   return 4095;
00734 }
00735 
00736 FastISel *
00737 AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
00738                                       const TargetLibraryInfo *libInfo) const {
00739   return AArch64::createFastISel(funcInfo, libInfo);
00740 }
00741 
00742 const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
00743   switch (Opcode) {
00744   default:
00745     return nullptr;
00746   case AArch64ISD::CALL:              return "AArch64ISD::CALL";
00747   case AArch64ISD::ADRP:              return "AArch64ISD::ADRP";
00748   case AArch64ISD::ADDlow:            return "AArch64ISD::ADDlow";
00749   case AArch64ISD::LOADgot:           return "AArch64ISD::LOADgot";
00750   case AArch64ISD::RET_FLAG:          return "AArch64ISD::RET_FLAG";
00751   case AArch64ISD::BRCOND:            return "AArch64ISD::BRCOND";
00752   case AArch64ISD::CSEL:              return "AArch64ISD::CSEL";
00753   case AArch64ISD::FCSEL:             return "AArch64ISD::FCSEL";
00754   case AArch64ISD::CSINV:             return "AArch64ISD::CSINV";
00755   case AArch64ISD::CSNEG:             return "AArch64ISD::CSNEG";
00756   case AArch64ISD::CSINC:             return "AArch64ISD::CSINC";
00757   case AArch64ISD::THREAD_POINTER:    return "AArch64ISD::THREAD_POINTER";
00758   case AArch64ISD::TLSDESC_CALL:      return "AArch64ISD::TLSDESC_CALL";
00759   case AArch64ISD::ADC:               return "AArch64ISD::ADC";
00760   case AArch64ISD::SBC:               return "AArch64ISD::SBC";
00761   case AArch64ISD::ADDS:              return "AArch64ISD::ADDS";
00762   case AArch64ISD::SUBS:              return "AArch64ISD::SUBS";
00763   case AArch64ISD::ADCS:              return "AArch64ISD::ADCS";
00764   case AArch64ISD::SBCS:              return "AArch64ISD::SBCS";
00765   case AArch64ISD::ANDS:              return "AArch64ISD::ANDS";
00766   case AArch64ISD::FCMP:              return "AArch64ISD::FCMP";
00767   case AArch64ISD::FMIN:              return "AArch64ISD::FMIN";
00768   case AArch64ISD::FMAX:              return "AArch64ISD::FMAX";
00769   case AArch64ISD::DUP:               return "AArch64ISD::DUP";
00770   case AArch64ISD::DUPLANE8:          return "AArch64ISD::DUPLANE8";
00771   case AArch64ISD::DUPLANE16:         return "AArch64ISD::DUPLANE16";
00772   case AArch64ISD::DUPLANE32:         return "AArch64ISD::DUPLANE32";
00773   case AArch64ISD::DUPLANE64:         return "AArch64ISD::DUPLANE64";
00774   case AArch64ISD::MOVI:              return "AArch64ISD::MOVI";
00775   case AArch64ISD::MOVIshift:         return "AArch64ISD::MOVIshift";
00776   case AArch64ISD::MOVIedit:          return "AArch64ISD::MOVIedit";
00777   case AArch64ISD::MOVImsl:           return "AArch64ISD::MOVImsl";
00778   case AArch64ISD::FMOV:              return "AArch64ISD::FMOV";
00779   case AArch64ISD::MVNIshift:         return "AArch64ISD::MVNIshift";
00780   case AArch64ISD::MVNImsl:           return "AArch64ISD::MVNImsl";
00781   case AArch64ISD::BICi:              return "AArch64ISD::BICi";
00782   case AArch64ISD::ORRi:              return "AArch64ISD::ORRi";
00783   case AArch64ISD::BSL:               return "AArch64ISD::BSL";
00784   case AArch64ISD::NEG:               return "AArch64ISD::NEG";
00785   case AArch64ISD::EXTR:              return "AArch64ISD::EXTR";
00786   case AArch64ISD::ZIP1:              return "AArch64ISD::ZIP1";
00787   case AArch64ISD::ZIP2:              return "AArch64ISD::ZIP2";
00788   case AArch64ISD::UZP1:              return "AArch64ISD::UZP1";
00789   case AArch64ISD::UZP2:              return "AArch64ISD::UZP2";
00790   case AArch64ISD::TRN1:              return "AArch64ISD::TRN1";
00791   case AArch64ISD::TRN2:              return "AArch64ISD::TRN2";
00792   case AArch64ISD::REV16:             return "AArch64ISD::REV16";
00793   case AArch64ISD::REV32:             return "AArch64ISD::REV32";
00794   case AArch64ISD::REV64:             return "AArch64ISD::REV64";
00795   case AArch64ISD::EXT:               return "AArch64ISD::EXT";
00796   case AArch64ISD::VSHL:              return "AArch64ISD::VSHL";
00797   case AArch64ISD::VLSHR:             return "AArch64ISD::VLSHR";
00798   case AArch64ISD::VASHR:             return "AArch64ISD::VASHR";
00799   case AArch64ISD::CMEQ:              return "AArch64ISD::CMEQ";
00800   case AArch64ISD::CMGE:              return "AArch64ISD::CMGE";
00801   case AArch64ISD::CMGT:              return "AArch64ISD::CMGT";
00802   case AArch64ISD::CMHI:              return "AArch64ISD::CMHI";
00803   case AArch64ISD::CMHS:              return "AArch64ISD::CMHS";
00804   case AArch64ISD::FCMEQ:             return "AArch64ISD::FCMEQ";
00805   case AArch64ISD::FCMGE:             return "AArch64ISD::FCMGE";
00806   case AArch64ISD::FCMGT:             return "AArch64ISD::FCMGT";
00807   case AArch64ISD::CMEQz:             return "AArch64ISD::CMEQz";
00808   case AArch64ISD::CMGEz:             return "AArch64ISD::CMGEz";
00809   case AArch64ISD::CMGTz:             return "AArch64ISD::CMGTz";
00810   case AArch64ISD::CMLEz:             return "AArch64ISD::CMLEz";
00811   case AArch64ISD::CMLTz:             return "AArch64ISD::CMLTz";
00812   case AArch64ISD::FCMEQz:            return "AArch64ISD::FCMEQz";
00813   case AArch64ISD::FCMGEz:            return "AArch64ISD::FCMGEz";
00814   case AArch64ISD::FCMGTz:            return "AArch64ISD::FCMGTz";
00815   case AArch64ISD::FCMLEz:            return "AArch64ISD::FCMLEz";
00816   case AArch64ISD::FCMLTz:            return "AArch64ISD::FCMLTz";
00817   case AArch64ISD::NOT:               return "AArch64ISD::NOT";
00818   case AArch64ISD::BIT:               return "AArch64ISD::BIT";
00819   case AArch64ISD::CBZ:               return "AArch64ISD::CBZ";
00820   case AArch64ISD::CBNZ:              return "AArch64ISD::CBNZ";
00821   case AArch64ISD::TBZ:               return "AArch64ISD::TBZ";
00822   case AArch64ISD::TBNZ:              return "AArch64ISD::TBNZ";
00823   case AArch64ISD::TC_RETURN:         return "AArch64ISD::TC_RETURN";
00824   case AArch64ISD::SITOF:             return "AArch64ISD::SITOF";
00825   case AArch64ISD::UITOF:             return "AArch64ISD::UITOF";
00826   case AArch64ISD::SQSHL_I:           return "AArch64ISD::SQSHL_I";
00827   case AArch64ISD::UQSHL_I:           return "AArch64ISD::UQSHL_I";
00828   case AArch64ISD::SRSHR_I:           return "AArch64ISD::SRSHR_I";
00829   case AArch64ISD::URSHR_I:           return "AArch64ISD::URSHR_I";
00830   case AArch64ISD::SQSHLU_I:          return "AArch64ISD::SQSHLU_I";
00831   case AArch64ISD::WrapperLarge:      return "AArch64ISD::WrapperLarge";
00832   case AArch64ISD::LD2post:           return "AArch64ISD::LD2post";
00833   case AArch64ISD::LD3post:           return "AArch64ISD::LD3post";
00834   case AArch64ISD::LD4post:           return "AArch64ISD::LD4post";
00835   case AArch64ISD::ST2post:           return "AArch64ISD::ST2post";
00836   case AArch64ISD::ST3post:           return "AArch64ISD::ST3post";
00837   case AArch64ISD::ST4post:           return "AArch64ISD::ST4post";
00838   case AArch64ISD::LD1x2post:         return "AArch64ISD::LD1x2post";
00839   case AArch64ISD::LD1x3post:         return "AArch64ISD::LD1x3post";
00840   case AArch64ISD::LD1x4post:         return "AArch64ISD::LD1x4post";
00841   case AArch64ISD::ST1x2post:         return "AArch64ISD::ST1x2post";
00842   case AArch64ISD::ST1x3post:         return "AArch64ISD::ST1x3post";
00843   case AArch64ISD::ST1x4post:         return "AArch64ISD::ST1x4post";
00844   case AArch64ISD::LD1DUPpost:        return "AArch64ISD::LD1DUPpost";
00845   case AArch64ISD::LD2DUPpost:        return "AArch64ISD::LD2DUPpost";
00846   case AArch64ISD::LD3DUPpost:        return "AArch64ISD::LD3DUPpost";
00847   case AArch64ISD::LD4DUPpost:        return "AArch64ISD::LD4DUPpost";
00848   case AArch64ISD::LD1LANEpost:       return "AArch64ISD::LD1LANEpost";
00849   case AArch64ISD::LD2LANEpost:       return "AArch64ISD::LD2LANEpost";
00850   case AArch64ISD::LD3LANEpost:       return "AArch64ISD::LD3LANEpost";
00851   case AArch64ISD::LD4LANEpost:       return "AArch64ISD::LD4LANEpost";
00852   case AArch64ISD::ST2LANEpost:       return "AArch64ISD::ST2LANEpost";
00853   case AArch64ISD::ST3LANEpost:       return "AArch64ISD::ST3LANEpost";
00854   case AArch64ISD::ST4LANEpost:       return "AArch64ISD::ST4LANEpost";
00855   }
00856 }
00857 
00858 MachineBasicBlock *
00859 AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
00860                                     MachineBasicBlock *MBB) const {
00861   // We materialise the F128CSEL pseudo-instruction as some control flow and a
00862   // phi node:
00863 
00864   // OrigBB:
00865   //     [... previous instrs leading to comparison ...]
00866   //     b.ne TrueBB
00867   //     b EndBB
00868   // TrueBB:
00869   //     ; Fallthrough
00870   // EndBB:
00871   //     Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
00872 
00873   const TargetInstrInfo *TII =
00874       getTargetMachine().getSubtargetImpl()->getInstrInfo();
00875   MachineFunction *MF = MBB->getParent();
00876   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
00877   DebugLoc DL = MI->getDebugLoc();
00878   MachineFunction::iterator It = MBB;
00879   ++It;
00880 
00881   unsigned DestReg = MI->getOperand(0).getReg();
00882   unsigned IfTrueReg = MI->getOperand(1).getReg();
00883   unsigned IfFalseReg = MI->getOperand(2).getReg();
00884   unsigned CondCode = MI->getOperand(3).getImm();
00885   bool NZCVKilled = MI->getOperand(4).isKill();
00886 
00887   MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
00888   MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
00889   MF->insert(It, TrueBB);
00890   MF->insert(It, EndBB);
00891 
00892   // Transfer rest of current basic-block to EndBB
00893   EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
00894                 MBB->end());
00895   EndBB->transferSuccessorsAndUpdatePHIs(MBB);
00896 
00897   BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
00898   BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
00899   MBB->addSuccessor(TrueBB);
00900   MBB->addSuccessor(EndBB);
00901 
00902   // TrueBB falls through to the end.
00903   TrueBB->addSuccessor(EndBB);
00904 
00905   if (!NZCVKilled) {
00906     TrueBB->addLiveIn(AArch64::NZCV);
00907     EndBB->addLiveIn(AArch64::NZCV);
00908   }
00909 
00910   BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
00911       .addReg(IfTrueReg)
00912       .addMBB(TrueBB)
00913       .addReg(IfFalseReg)
00914       .addMBB(MBB);
00915 
00916   MI->eraseFromParent();
00917   return EndBB;
00918 }
00919 
00920 MachineBasicBlock *
00921 AArch64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
00922                                                  MachineBasicBlock *BB) const {
00923   switch (MI->getOpcode()) {
00924   default:
00925 #ifndef NDEBUG
00926     MI->dump();
00927 #endif
00928     llvm_unreachable("Unexpected instruction for custom inserter!");
00929 
00930   case AArch64::F128CSEL:
00931     return EmitF128CSEL(MI, BB);
00932 
00933   case TargetOpcode::STACKMAP:
00934   case TargetOpcode::PATCHPOINT:
00935     return emitPatchPoint(MI, BB);
00936   }
00937 }
00938 
00939 //===----------------------------------------------------------------------===//
00940 // AArch64 Lowering private implementation.
00941 //===----------------------------------------------------------------------===//
00942 
00943 //===----------------------------------------------------------------------===//
00944 // Lowering Code
00945 //===----------------------------------------------------------------------===//
00946 
00947 /// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
00948 /// CC
00949 static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
00950   switch (CC) {
00951   default:
00952     llvm_unreachable("Unknown condition code!");
00953   case ISD::SETNE:
00954     return AArch64CC::NE;
00955   case ISD::SETEQ:
00956     return AArch64CC::EQ;
00957   case ISD::SETGT:
00958     return AArch64CC::GT;
00959   case ISD::SETGE:
00960     return AArch64CC::GE;
00961   case ISD::SETLT:
00962     return AArch64CC::LT;
00963   case ISD::SETLE:
00964     return AArch64CC::LE;
00965   case ISD::SETUGT:
00966     return AArch64CC::HI;
00967   case ISD::SETUGE:
00968     return AArch64CC::HS;
00969   case ISD::SETULT:
00970     return AArch64CC::LO;
00971   case ISD::SETULE:
00972     return AArch64CC::LS;
00973   }
00974 }
00975 
00976 /// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
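      /// Some unordered conditions need two AArch64 conditions (e.g. SETONE is
      /// MI or GT); the second one is returned in CondCode2, which is otherwise
      /// left as AL.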
00977 static void changeFPCCToAArch64CC(ISD::CondCode CC,
00978                                   AArch64CC::CondCode &CondCode,
00979                                   AArch64CC::CondCode &CondCode2) {
00980   CondCode2 = AArch64CC::AL;
00981   switch (CC) {
00982   default:
00983     llvm_unreachable("Unknown FP condition!");
00984   case ISD::SETEQ:
00985   case ISD::SETOEQ:
00986     CondCode = AArch64CC::EQ;
00987     break;
00988   case ISD::SETGT:
00989   case ISD::SETOGT:
00990     CondCode = AArch64CC::GT;
00991     break;
00992   case ISD::SETGE:
00993   case ISD::SETOGE:
00994     CondCode = AArch64CC::GE;
00995     break;
00996   case ISD::SETOLT:
00997     CondCode = AArch64CC::MI;
00998     break;
00999   case ISD::SETOLE:
01000     CondCode = AArch64CC::LS;
01001     break;
01002   case ISD::SETONE:
01003     CondCode = AArch64CC::MI;
01004     CondCode2 = AArch64CC::GT;
01005     break;
01006   case ISD::SETO:
01007     CondCode = AArch64CC::VC;
01008     break;
01009   case ISD::SETUO:
01010     CondCode = AArch64CC::VS;
01011     break;
01012   case ISD::SETUEQ:
01013     CondCode = AArch64CC::EQ;
01014     CondCode2 = AArch64CC::VS;
01015     break;
01016   case ISD::SETUGT:
01017     CondCode = AArch64CC::HI;
01018     break;
01019   case ISD::SETUGE:
01020     CondCode = AArch64CC::PL;
01021     break;
01022   case ISD::SETLT:
01023   case ISD::SETULT:
01024     CondCode = AArch64CC::LT;
01025     break;
01026   case ISD::SETLE:
01027   case ISD::SETULE:
01028     CondCode = AArch64CC::LE;
01029     break;
01030   case ISD::SETNE:
01031   case ISD::SETUNE:
01032     CondCode = AArch64CC::NE;
01033     break;
01034   }
01035 }
01036 
01037 /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
01038 /// CC usable with the vector instructions. Fewer operations are available
01039 /// without a real NZCV register, so we have to use less efficient combinations
01040 /// to get the same effect.
01041 static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
01042                                         AArch64CC::CondCode &CondCode,
01043                                         AArch64CC::CondCode &CondCode2,
01044                                         bool &Invert) {
01045   Invert = false;
01046   switch (CC) {
01047   default:
01048     // Mostly the scalar mappings work fine.
01049     changeFPCCToAArch64CC(CC, CondCode, CondCode2);
01050     break;
01051   case ISD::SETUO:
01052     Invert = true; // Fallthrough
01053   case ISD::SETO:
01054     CondCode = AArch64CC::MI;
01055     CondCode2 = AArch64CC::GE;
01056     break;
01057   case ISD::SETUEQ:
01058   case ISD::SETULT:
01059   case ISD::SETULE:
01060   case ISD::SETUGT:
01061   case ISD::SETUGE:
01062     // All of the compare-mask comparisons are ordered, but we can switch
01063     // between the two by a double inversion. E.g. ULE == !OGT.
01064     Invert = true;
01065     changeFPCCToAArch64CC(getSetCCInverse(CC, false), CondCode, CondCode2);
01066     break;
01067   }
01068 }
01069 
01070 static bool isLegalArithImmed(uint64_t C) {
01071   // Matches AArch64DAGToDAGISel::SelectArithImmed().
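        // Legal values are a 12-bit immediate, optionally shifted left by 12:
        // e.g. 0xabc and 0xabc000 are legal, 0xabcde is not.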
01072   return (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
01073 }
01074 
01075 static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
01076                               SDLoc dl, SelectionDAG &DAG) {
01077   EVT VT = LHS.getValueType();
01078 
01079   if (VT.isFloatingPoint())
01080     return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
01081 
01082   // The CMP instruction is just an alias for SUBS, and representing it as
01083   // SUBS means that it's possible to get CSE with subtract operations.
01084   // A later phase can perform the optimization of setting the destination
01085   // register to WZR/XZR if it ends up being unused.
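        // (In AArch64 assembly, "cmp x0, x1" is simply an alias for
        // "subs xzr, x0, x1".)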
01086   unsigned Opcode = AArch64ISD::SUBS;
01087 
01088   if (RHS.getOpcode() == ISD::SUB && isa<ConstantSDNode>(RHS.getOperand(0)) &&
01089       cast<ConstantSDNode>(RHS.getOperand(0))->getZExtValue() == 0 &&
01090       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
01091     // We'd like to combine a (CMP op1, (sub 0, op2)) into a CMN instruction on
01092     // the grounds that "op1 - (-op2) == op1 + op2". However, the C and V flags
01093     // can be set differently by this operation. It comes down to whether
01094     // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
01095     // everything is fine. If not then the optimization is wrong. Thus general
01096     // comparisons are only valid if op2 != 0.
01097 
01098     // So, finally, the only LLVM-native comparisons that don't mention C and V
01099     // are SETEQ and SETNE. They're the only ones we can safely use CMN for in
01100     // the absence of information about op2.
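          // For example, (seteq x, (sub 0, y)) becomes (ADDS x, y), i.e. a CMN,
          // with only the Z flag consumed.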
01101     Opcode = AArch64ISD::ADDS;
01102     RHS = RHS.getOperand(1);
01103   } else if (LHS.getOpcode() == ISD::AND && isa<ConstantSDNode>(RHS) &&
01104              cast<ConstantSDNode>(RHS)->getZExtValue() == 0 &&
01105              !isUnsignedIntSetCC(CC)) {
01106     // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
01107     // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
01108     // of the signed comparisons.
01109     Opcode = AArch64ISD::ANDS;
01110     RHS = LHS.getOperand(1);
01111     LHS = LHS.getOperand(0);
01112   }
01113 
01114   return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS)
01115       .getValue(1);
01116 }
01117 
01118 static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
01119                              SDValue &AArch64cc, SelectionDAG &DAG, SDLoc dl) {
01120   SDValue Cmp;
01121   AArch64CC::CondCode AArch64CC;
01122   if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
01123     EVT VT = RHS.getValueType();
01124     uint64_t C = RHSC->getZExtValue();
01125     if (!isLegalArithImmed(C)) {
01126       // Constant does not fit, try adjusting it by one?
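            // For example, (setlt x, 0x1001) cannot encode 0x1001, but it is
            // equivalent to (setle x, 0x1000), and 0x1000 is a legal immediate.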
01127       switch (CC) {
01128       default:
01129         break;
01130       case ISD::SETLT:
01131       case ISD::SETGE:
01132         if ((VT == MVT::i32 && C != 0x80000000 &&
01133              isLegalArithImmed((uint32_t)(C - 1))) ||
01134             (VT == MVT::i64 && C != 0x8000000000000000ULL &&
01135              isLegalArithImmed(C - 1ULL))) {
01136           CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
01137           C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
01138           RHS = DAG.getConstant(C, VT);
01139         }
01140         break;
01141       case ISD::SETULT:
01142       case ISD::SETUGE:
01143         if ((VT == MVT::i32 && C != 0 &&
01144              isLegalArithImmed((uint32_t)(C - 1))) ||
01145             (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
01146           CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
01147           C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
01148           RHS = DAG.getConstant(C, VT);
01149         }
01150         break;
01151       case ISD::SETLE:
01152       case ISD::SETGT:
01153         if ((VT == MVT::i32 && C != 0x7fffffff &&
01154              isLegalArithImmed((uint32_t)(C + 1))) ||
01155             (VT == MVT::i64 && C != 0x7fffffffffffffffULL &&
01156              isLegalArithImmed(C + 1ULL))) {
01157           CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
01158           C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
01159           RHS = DAG.getConstant(C, VT);
01160         }
01161         break;
01162       case ISD::SETULE:
01163       case ISD::SETUGT:
01164         if ((VT == MVT::i32 && C != 0xffffffff &&
01165              isLegalArithImmed((uint32_t)(C + 1))) ||
01166             (VT == MVT::i64 && C != 0xffffffffffffffffULL &&
01167              isLegalArithImmed(C + 1ULL))) {
01168           CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
01169           C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
01170           RHS = DAG.getConstant(C, VT);
01171         }
01172         break;
01173       }
01174     }
01175   }
01176   // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
01177   // For the i8 operand, the largest immediate is 255, so this can be easily
01178   // encoded in the compare instruction. For the i16 operand, however, the
01179   // largest immediate cannot be encoded in the compare.
01180   // Therefore, use a sign extending load and cmn to avoid materializing the -1
01181   // constant. For example,
01182   // movz w1, #65535
01183   // ldrh w0, [x0, #0]
01184   // cmp w0, w1
01185   // >
01186   // ldrsh w0, [x0, #0]
01187   // cmn w0, #1
01188   // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
01189   // if and only if (sext LHS) == (sext RHS). The checks are in place to ensure
01190   // both the LHS and RHS are truly zero extended and to make sure the
01191   // transformation is profitable.
01192   if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
01193     if ((cast<ConstantSDNode>(RHS)->getZExtValue() >> 16 == 0) &&
01194         isa<LoadSDNode>(LHS)) {
01195       if (cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
01196           cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
01197           LHS.getNode()->hasNUsesOfValue(1, 0)) {
01198         int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
01199         if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
01200           SDValue SExt =
01201               DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
01202                           DAG.getValueType(MVT::i16));
01203           Cmp = emitComparison(SExt,
01204                                DAG.getConstant(ValueofRHS, RHS.getValueType()),
01205                                CC, dl, DAG);
01206           AArch64CC = changeIntCCToAArch64CC(CC);
01207           AArch64cc = DAG.getConstant(AArch64CC, MVT::i32);
01208           return Cmp;
01209         }
01210       }
01211     }
01212   }
01213   Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
01214   AArch64CC = changeIntCCToAArch64CC(CC);
01215   AArch64cc = DAG.getConstant(AArch64CC, MVT::i32);
01216   return Cmp;
01217 }
01218 
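/// getAArch64XALUOOp - Lower an overflow-checking arithmetic node ([SU]ADDO,
/// [SU]SUBO, [SU]MULO) into the flag-setting AArch64 equivalent. Returns the
/// arithmetic result together with the flags value to test, and sets CC to the
/// condition under which overflow occurred.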
01219 static std::pair<SDValue, SDValue>
01220 getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
01221   assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
01222          "Unsupported value type");
01223   SDValue Value, Overflow;
01224   SDLoc DL(Op);
01225   SDValue LHS = Op.getOperand(0);
01226   SDValue RHS = Op.getOperand(1);
01227   unsigned Opc = 0;
01228   switch (Op.getOpcode()) {
01229   default:
01230     llvm_unreachable("Unknown overflow instruction!");
01231   case ISD::SADDO:
01232     Opc = AArch64ISD::ADDS;
01233     CC = AArch64CC::VS;
01234     break;
01235   case ISD::UADDO:
01236     Opc = AArch64ISD::ADDS;
01237     CC = AArch64CC::HS;
01238     break;
01239   case ISD::SSUBO:
01240     Opc = AArch64ISD::SUBS;
01241     CC = AArch64CC::VS;
01242     break;
01243   case ISD::USUBO:
01244     Opc = AArch64ISD::SUBS;
01245     CC = AArch64CC::LO;
01246     break;
01247   // Multiply needs a little bit of extra work.
01248   case ISD::SMULO:
01249   case ISD::UMULO: {
01250     CC = AArch64CC::NE;
01251     bool IsSigned = Op.getOpcode() == ISD::SMULO;
01252     if (Op.getValueType() == MVT::i32) {
01253       unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
01254       // For a 32 bit multiply with overflow check we want the instruction
01255       // selector to generate a widening multiply (SMADDL/UMADDL). For that we
01256       // need to generate the following pattern:
01257       // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b))
01258       LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
01259       RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
01260       SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
01261       SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul,
01262                                 DAG.getConstant(0, MVT::i64));
01263       // On AArch64 the upper 32 bits are always zero extended for a 32 bit
01264       // operation. We need to clear out the upper 32 bits, because we used a
01265       // widening multiply that wrote all 64 bits. In the end this should be a
01266       // noop.
01267       Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add);
01268       if (IsSigned) {
01269         // The signed overflow check requires more than just a simple check for
01270         // any bit set in the upper 32 bits of the result. These bits could be
01271         // just the sign bits of a negative number. To perform the overflow
01272         // check we arithmetically shift the lower 32 bits of the result right
01273         // by 31 (replicating the sign bit) and compare that with the upper 32 bits.
01274         SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add,
01275                                         DAG.getConstant(32, MVT::i64));
01276         UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits);
01277         SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value,
01278                                         DAG.getConstant(31, MVT::i64));
01279         // It is important that LowerBits is last, otherwise the arithmetic
01280         // shift will not be folded into the compare (SUBS).
01281         SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32);
01282         Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
01283                        .getValue(1);
01284       } else {
01285         // The overflow check for unsigned multiply is easy. We only need to
01286         // check if any of the upper 32 bits are set. This can be done with a
01287         // CMP (shifted register). For that we need to generate the following
01288         // pattern:
01289         // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32)
01290         SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul,
01291                                         DAG.getConstant(32, MVT::i64));
01292         SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
01293         Overflow =
01294             DAG.getNode(AArch64ISD::SUBS, DL, VTs, DAG.getConstant(0, MVT::i64),
01295                         UpperBits).getValue(1);
01296       }
01297       break;
01298     }
01299     assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
01300     // For the 64 bit multiply, the high half of the product drives the overflow check.
01301     Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
01302     if (IsSigned) {
01303       SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
01304       SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
01305                                       DAG.getConstant(63, MVT::i64));
01306       // It is important that LowerBits is last, otherwise the arithmetic
01307       // shift will not be folded into the compare (SUBS).
01308       SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
01309       Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
01310                      .getValue(1);
01311     } else {
01312       SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
01313       SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
01314       Overflow =
01315           DAG.getNode(AArch64ISD::SUBS, DL, VTs, DAG.getConstant(0, MVT::i64),
01316                       UpperBits).getValue(1);
01317     }
01318     break;
01319   }
01320   } // switch (...)
01321 
01322   if (Opc) {
01323     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
01324 
01325     // Emit the AArch64 operation with overflow check.
01326     Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
01327     Overflow = Value.getValue(1);
01328   }
01329   return std::make_pair(Value, Overflow);
01330 }
01331 
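/// LowerF128Call - Lower an operation on the software-emulated f128 type into
/// a call to the given runtime library function, forwarding the node's
/// operands unchanged.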
01332 SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
01333                                              RTLIB::Libcall Call) const {
01334   SmallVector<SDValue, 2> Ops;
01335   for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i)
01336     Ops.push_back(Op.getOperand(i));
01337 
01338   return makeLibCall(DAG, Call, MVT::f128, &Ops[0], Ops.size(), false,
01339                      SDLoc(Op)).first;
01340 }
01341 
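/// LowerXOR - Try to fold (xor x, (select_cc ..., 0, -1)) into a conditional
/// select of x and (xor x, -1), which the instruction selector can match to
/// CSINV. Any other XOR is returned unchanged.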
01342 static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) {
01343   SDValue Sel = Op.getOperand(0);
01344   SDValue Other = Op.getOperand(1);
01345 
01346   // If neither operand is a SELECT_CC, give up.
01347   if (Sel.getOpcode() != ISD::SELECT_CC)
01348     std::swap(Sel, Other);
01349   if (Sel.getOpcode() != ISD::SELECT_CC)
01350     return Op;
01351 
01352   // The folding we want to perform is:
01353   // (xor x, (select_cc a, b, cc, 0, -1) )
01354   //   -->
01355   // (csel x, (xor x, -1), cc ...)
01356   //
01357   // The latter will get matched to a CSINV instruction.
01358 
01359   ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
01360   SDValue LHS = Sel.getOperand(0);
01361   SDValue RHS = Sel.getOperand(1);
01362   SDValue TVal = Sel.getOperand(2);
01363   SDValue FVal = Sel.getOperand(3);
01364   SDLoc dl(Sel);
01365 
01366   // FIXME: This could be generalized to non-integer comparisons.
01367   if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
01368     return Op;
01369 
01370   ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
01371   ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
01372 
01373   // If the values aren't constants, this isn't the pattern we're looking for.
01374   if (!CFVal || !CTVal)
01375     return Op;
01376 
01377   // We can commute the SELECT_CC by inverting the condition.  This
01378   // might be needed to make this fit into a CSINV pattern.
01379   if (CTVal->isAllOnesValue() && CFVal->isNullValue()) {
01380     std::swap(TVal, FVal);
01381     std::swap(CTVal, CFVal);
01382     CC = ISD::getSetCCInverse(CC, true);
01383   }
01384 
01385   // If the constants line up, perform the transform!
01386   if (CTVal->isNullValue() && CFVal->isAllOnesValue()) {
01387     SDValue CCVal;
01388     SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
01389 
01390     FVal = Other;
01391     TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
01392                        DAG.getConstant(-1ULL, Other.getValueType()));
01393 
01394     return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
01395                        CCVal, Cmp);
01396   }
01397 
01398   return Op;
01399 }
01400 
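/// LowerADDC_ADDE_SUBC_SUBE - Map the generic carry-producing/consuming
/// arithmetic nodes onto the AArch64 flag-setting forms: ADDC -> ADDS,
/// SUBC -> SUBS, ADDE -> ADCS and SUBE -> SBCS (the last two also take the
/// incoming carry operand).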
01401 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
01402   EVT VT = Op.getValueType();
01403 
01404   // Let legalize expand this if it isn't a legal type yet.
01405   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
01406     return SDValue();
01407 
01408   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
01409 
01410   unsigned Opc;
01411   bool ExtraOp = false;
01412   switch (Op.getOpcode()) {
01413   default:
01414     llvm_unreachable("Invalid code");
01415   case ISD::ADDC:
01416     Opc = AArch64ISD::ADDS;
01417     break;
01418   case ISD::SUBC:
01419     Opc = AArch64ISD::SUBS;
01420     break;
01421   case ISD::ADDE:
01422     Opc = AArch64ISD::ADCS;
01423     ExtraOp = true;
01424     break;
01425   case ISD::SUBE:
01426     Opc = AArch64ISD::SBCS;
01427     ExtraOp = true;
01428     break;
01429   }
01430 
01431   if (!ExtraOp)
01432     return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1));
01433   return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1),
01434                      Op.getOperand(2));
01435 }
01436 
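/// LowerXALUO - Lower an overflow intrinsic node by emitting the flag-setting
/// operation from getAArch64XALUOOp and a conditional select that materializes
/// the overflow flag as 0 or 1.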
01437 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
01438   // Let legalize expand this if it isn't a legal type yet.
01439   if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
01440     return SDValue();
01441 
01442   AArch64CC::CondCode CC;
01443   // The actual operation that sets the overflow or carry flag.
01444   SDValue Value, Overflow;
01445   std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
01446 
01447   // We use 0 and 1 as false and true values.
01448   SDValue TVal = DAG.getConstant(1, MVT::i32);
01449   SDValue FVal = DAG.getConstant(0, MVT::i32);
01450 
01451   // We use an inverted condition, because the conditional select is inverted
01452   // too. This will allow it to be selected to a single instruction:
01453   // CSINC Wd, WZR, WZR, invert(cond).
01454   SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), MVT::i32);
01455   Overflow = DAG.getNode(AArch64ISD::CSEL, SDLoc(Op), MVT::i32, FVal, TVal,
01456                          CCVal, Overflow);
01457 
01458   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
01459   return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op), VTs, Value, Overflow);
01460 }
01461 
01462 // Prefetch operands are:
01463 // 1: Address to prefetch
01464 // 2: bool isWrite
01465 // 3: int locality (0 = no locality ... 3 = extreme locality)
01466 // 4: bool isDataCache
01467 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
01468   SDLoc DL(Op);
01469   unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
01470   unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
01471   unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
01472 
01473   bool IsStream = !Locality;
01474   // When the locality number is nonzero:
01475   if (Locality) {
01476     // The front-end should have filtered out the out-of-range values
01477     assert(Locality <= 3 && "Prefetch locality out-of-range");
01478     // The locality degree runs opposite to the cache level in the encoding
01479     // (the encoding starts at 0 for level 1),
01480     // so flip the number around.
01481     Locality = 3 - Locality;
01482   }
01483 
01484   // Build the mask value encoding the expected behavior.
01485   unsigned PrfOp = (IsWrite << 4) |     // Load/Store bit
01486                    (!IsData << 3) |     // IsDataCache bit
01487                    (Locality << 1) |    // Cache level bits
01488                    (unsigned)IsStream;  // Stream bit
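  // For example, a data read with maximal locality (IsWrite=0, Locality=3,
  // IsData=1) encodes as 0b00000 (PLDL1KEEP), while a streaming data write
  // (IsWrite=1, Locality=0, IsData=1) encodes as 0b10001 (PSTL1STRM).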
01489   return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
01490                      DAG.getConstant(PrfOp, MVT::i32), Op.getOperand(1));
01491 }
01492 
01493 SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
01494                                               SelectionDAG &DAG) const {
01495   assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
01496 
01497   RTLIB::Libcall LC;
01498   LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());
01499 
01500   return LowerF128Call(Op, DAG, LC);
01501 }
01502 
01503 SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
01504                                              SelectionDAG &DAG) const {
01505   if (Op.getOperand(0).getValueType() != MVT::f128) {
01506     // It's legal except when f128 is involved
01507     return Op;
01508   }
01509 
01510   RTLIB::Libcall LC;
01511   LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
01512 
01513   // FP_ROUND node has a second operand indicating whether it is known to be
01514   // precise. That doesn't take part in the LibCall so we can't directly use
01515   // LowerF128Call.
01516   SDValue SrcVal = Op.getOperand(0);
01517   return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1,
01518                      /*isSigned*/ false, SDLoc(Op)).first;
01519 }
01520 
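/// LowerVectorFP_TO_INT - Lower vector FP-to-integer conversions whose result
/// elements are narrower or wider than the source: narrower results are
/// produced by converting at the source width and truncating, wider results by
/// first extending the source to a matching floating-point width.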
01521 static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
01522   // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
01523   // Any additional optimization in this function should be recorded
01524   // in the cost tables.
01525   EVT InVT = Op.getOperand(0).getValueType();
01526   EVT VT = Op.getValueType();
01527 
01528   if (VT.getSizeInBits() < InVT.getSizeInBits()) {
01529     SDLoc dl(Op);
01530     SDValue Cv =
01531         DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
01532                     Op.getOperand(0));
01533     return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
01534   }
01535 
01536   if (VT.getSizeInBits() > InVT.getSizeInBits()) {
01537     SDLoc dl(Op);
01538     MVT ExtVT =
01539         MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
01540                          VT.getVectorNumElements());
01541     SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
01542     return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
01543   }
01544 
01545   // Type changing conversions are illegal.
01546   return Op;
01547 }
01548 
01549 SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
01550                                               SelectionDAG &DAG) const {
01551   if (Op.getOperand(0).getValueType().isVector())
01552     return LowerVectorFP_TO_INT(Op, DAG);
01553 
01554   if (Op.getOperand(0).getValueType() != MVT::f128) {
01555     // It's legal except when f128 is involved
01556     return Op;
01557   }
01558 
01559   RTLIB::Libcall LC;
01560   if (Op.getOpcode() == ISD::FP_TO_SINT)
01561     LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType());
01562   else
01563     LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType());
01564 
01565   SmallVector<SDValue, 2> Ops;
01566   for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i)
01567     Ops.push_back(Op.getOperand(i));
01568 
01569   return makeLibCall(DAG, LC, Op.getValueType(), &Ops[0], Ops.size(), false,
01570                      SDLoc(Op)).first;
01571 }
01572 
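/// LowerVectorINT_TO_FP - Lower vector integer-to-FP conversions whose result
/// elements are narrower or wider than the source: narrower results go through
/// an FP_ROUND after converting at the source width, wider results sign- or
/// zero-extend the integer input before converting.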
01573 static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
01574   // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
01575   // Any additional optimization in this function should be recorded
01576   // in the cost tables.
01577   EVT VT = Op.getValueType();
01578   SDLoc dl(Op);
01579   SDValue In = Op.getOperand(0);
01580   EVT InVT = In.getValueType();
01581 
01582   if (VT.getSizeInBits() < InVT.getSizeInBits()) {
01583     MVT CastVT =
01584         MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
01585                          InVT.getVectorNumElements());
01586     In = DAG.getNode(Op.getOpcode(), dl, CastVT, In);
01587     return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0));
01588   }
01589 
01590   if (VT.getSizeInBits() > InVT.getSizeInBits()) {
01591     unsigned CastOpc =
01592         Op.getOpcode() == ISD::SINT_TO_FP ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
01593     EVT CastVT = VT.changeVectorElementTypeToInteger();
01594     In = DAG.getNode(CastOpc, dl, CastVT, In);
01595     return DAG.getNode(Op.getOpcode(), dl, VT, In);
01596   }
01597 
01598   return Op;
01599 }
01600 
01601 SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
01602                                             SelectionDAG &DAG) const {
01603   if (Op.getValueType().isVector())
01604     return LowerVectorINT_TO_FP(Op, DAG);
01605 
01606   // i128 conversions are libcalls.
01607   if (Op.getOperand(0).getValueType() == MVT::i128)
01608     return SDValue();
01609 
01610   // Other conversions are legal, unless it's to the completely software-based
01611   // fp128.
01612   if (Op.getValueType() != MVT::f128)
01613     return Op;
01614 
01615   RTLIB::Libcall LC;
01616   if (Op.getOpcode() == ISD::SINT_TO_FP)
01617     LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
01618   else
01619     LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
01620 
01621   return LowerF128Call(Op, DAG, LC);
01622 }
01623 
01624 SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
01625                                             SelectionDAG &DAG) const {
01626   // For iOS, we want to call an alternative entry point: __sincos_stret,
01627   // which returns the values in two S / D registers.
01628   SDLoc dl(Op);
01629   SDValue Arg = Op.getOperand(0);
01630   EVT ArgVT = Arg.getValueType();
01631   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
01632 
01633   ArgListTy Args;
01634   ArgListEntry Entry;
01635 
01636   Entry.Node = Arg;
01637   Entry.Ty = ArgTy;
01638   Entry.isSExt = false;
01639   Entry.isZExt = false;
01640   Args.push_back(Entry);
01641 
01642   const char *LibcallName =
01643       (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret";
01644   SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy());
01645 
01646   StructType *RetTy = StructType::get(ArgTy, ArgTy, NULL);
01647   TargetLowering::CallLoweringInfo CLI(DAG);
01648   CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
01649     .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args), 0);
01650 
01651   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
01652   return CallResult.first;
01653 }
01654 
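/// LowerBITCAST - Lower an i16-to-f16 bitcast by extending the integer to i32,
/// bitcasting that to f32 and extracting the 16-bit 'hsub' subregister of the
/// FP register.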
01655 static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) {
01656   if (Op.getValueType() != MVT::f16)
01657     return SDValue();
01658 
01659   assert(Op.getOperand(0).getValueType() == MVT::i16);
01660   SDLoc DL(Op);
01661 
01662   Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
01663   Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
01664   return SDValue(
01665       DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f16, Op,
01666                          DAG.getTargetConstant(AArch64::hsub, MVT::i32)),
01667       0);
01668 }
01669 
01670 
01671 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
01672                                               SelectionDAG &DAG) const {
01673   switch (Op.getOpcode()) {
01674   default:
01675     llvm_unreachable("unimplemented operand");
01676     return SDValue();
01677   case ISD::BITCAST:
01678     return LowerBITCAST(Op, DAG);
01679   case ISD::GlobalAddress:
01680     return LowerGlobalAddress(Op, DAG);
01681   case ISD::GlobalTLSAddress:
01682     return LowerGlobalTLSAddress(Op, DAG);
01683   case ISD::SETCC:
01684     return LowerSETCC(Op, DAG);
01685   case ISD::BR_CC:
01686     return LowerBR_CC(Op, DAG);
01687   case ISD::SELECT:
01688     return LowerSELECT(Op, DAG);
01689   case ISD::SELECT_CC:
01690     return LowerSELECT_CC(Op, DAG);
01691   case ISD::JumpTable:
01692     return LowerJumpTable(Op, DAG);
01693   case ISD::ConstantPool:
01694     return LowerConstantPool(Op, DAG);
01695   case ISD::BlockAddress:
01696     return LowerBlockAddress(Op, DAG);
01697   case ISD::VASTART:
01698     return LowerVASTART(Op, DAG);
01699   case ISD::VACOPY:
01700     return LowerVACOPY(Op, DAG);
01701   case ISD::VAARG:
01702     return LowerVAARG(Op, DAG);
01703   case ISD::ADDC:
01704   case ISD::ADDE:
01705   case ISD::SUBC:
01706   case ISD::SUBE:
01707     return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
01708   case ISD::SADDO:
01709   case ISD::UADDO:
01710   case ISD::SSUBO:
01711   case ISD::USUBO:
01712   case ISD::SMULO:
01713   case ISD::UMULO:
01714     return LowerXALUO(Op, DAG);
01715   case ISD::FADD:
01716     return LowerF128Call(Op, DAG, RTLIB::ADD_F128);
01717   case ISD::FSUB:
01718     return LowerF128Call(Op, DAG, RTLIB::SUB_F128);
01719   case ISD::FMUL:
01720     return LowerF128Call(Op, DAG, RTLIB::MUL_F128);
01721   case ISD::FDIV:
01722     return LowerF128Call(Op, DAG, RTLIB::DIV_F128);
01723   case ISD::FP_ROUND:
01724     return LowerFP_ROUND(Op, DAG);
01725   case ISD::FP_EXTEND:
01726     return LowerFP_EXTEND(Op, DAG);
01727   case ISD::FRAMEADDR:
01728     return LowerFRAMEADDR(Op, DAG);
01729   case ISD::RETURNADDR:
01730     return LowerRETURNADDR(Op, DAG);
01731   case ISD::INSERT_VECTOR_ELT:
01732     return LowerINSERT_VECTOR_ELT(Op, DAG);
01733   case ISD::EXTRACT_VECTOR_ELT:
01734     return LowerEXTRACT_VECTOR_ELT(Op, DAG);
01735   case ISD::BUILD_VECTOR:
01736     return LowerBUILD_VECTOR(Op, DAG);
01737   case ISD::VECTOR_SHUFFLE:
01738     return LowerVECTOR_SHUFFLE(Op, DAG);
01739   case ISD::EXTRACT_SUBVECTOR:
01740     return LowerEXTRACT_SUBVECTOR(Op, DAG);
01741   case ISD::SRA:
01742   case ISD::SRL:
01743   case ISD::SHL:
01744     return LowerVectorSRA_SRL_SHL(Op, DAG);
01745   case ISD::SHL_PARTS:
01746     return LowerShiftLeftParts(Op, DAG);
01747   case ISD::SRL_PARTS:
01748   case ISD::SRA_PARTS:
01749     return LowerShiftRightParts(Op, DAG);
01750   case ISD::CTPOP:
01751     return LowerCTPOP(Op, DAG);
01752   case ISD::FCOPYSIGN:
01753     return LowerFCOPYSIGN(Op, DAG);
01754   case ISD::AND:
01755     return LowerVectorAND(Op, DAG);
01756   case ISD::OR:
01757     return LowerVectorOR(Op, DAG);
01758   case ISD::XOR:
01759     return LowerXOR(Op, DAG);
01760   case ISD::PREFETCH:
01761     return LowerPREFETCH(Op, DAG);
01762   case ISD::SINT_TO_FP:
01763   case ISD::UINT_TO_FP:
01764     return LowerINT_TO_FP(Op, DAG);
01765   case ISD::FP_TO_SINT:
01766   case ISD::FP_TO_UINT:
01767     return LowerFP_TO_INT(Op, DAG);
01768   case ISD::FSINCOS:
01769     return LowerFSINCOS(Op, DAG);
01770   }
01771 }
01772 
01773 /// getFunctionAlignment - Return the Log2 alignment of this function.
01774 unsigned AArch64TargetLowering::getFunctionAlignment(const Function *F) const {
01775   return 2;
01776 }
01777 
01778 //===----------------------------------------------------------------------===//
01779 //                      Calling Convention Implementation
01780 //===----------------------------------------------------------------------===//
01781 
01782 #include "AArch64GenCallingConv.inc"
01783 
01784 /// Selects the correct CCAssignFn for a given CallingConvention value.
01785 CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
01786                                                      bool IsVarArg) const {
01787   switch (CC) {
01788   default:
01789     llvm_unreachable("Unsupported calling convention.");
01790   case CallingConv::WebKit_JS:
01791     return CC_AArch64_WebKit_JS;
01792   case CallingConv::C:
01793   case CallingConv::Fast:
01794     if (!Subtarget->isTargetDarwin())
01795       return CC_AArch64_AAPCS;
01796     return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS;
01797   }
01798 }
01799 
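/// LowerFormalArguments - Lower the incoming arguments of a function: assign
/// each argument a register or stack slot via CCAssignFnForCall, copy register
/// arguments out of their physical registers, load stack arguments from fixed
/// frame objects, and spill the unused argument registers if the function is
/// variadic.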
01800 SDValue AArch64TargetLowering::LowerFormalArguments(
01801     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
01802     const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
01803     SmallVectorImpl<SDValue> &InVals) const {
01804   MachineFunction &MF = DAG.getMachineFunction();
01805   MachineFrameInfo *MFI = MF.getFrameInfo();
01806 
01807   // Assign locations to all of the incoming arguments.
01808   SmallVector<CCValAssign, 16> ArgLocs;
01809   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
01810                  *DAG.getContext());
01811 
01812   // At this point, Ins[].VT may already be promoted to i32. To correctly
01813   // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
01814   // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
01815   // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
01816   // we use a special version of AnalyzeFormalArguments to pass in ValVT and
01817   // LocVT.
01818   unsigned NumArgs = Ins.size();
01819   Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin();
01820   unsigned CurArgIdx = 0;
01821   for (unsigned i = 0; i != NumArgs; ++i) {
01822     MVT ValVT = Ins[i].VT;
01823     std::advance(CurOrigArg, Ins[i].OrigArgIndex - CurArgIdx);
01824     CurArgIdx = Ins[i].OrigArgIndex;
01825 
01826     // Get type of the original argument.
01827     EVT ActualVT = getValueType(CurOrigArg->getType(), /*AllowUnknown*/ true);
01828     MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
01829     // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
01830     if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
01831       ValVT = MVT::i8;
01832     else if (ActualMVT == MVT::i16)
01833       ValVT = MVT::i16;
01834 
01835     CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
01836     bool Res =
01837         AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
01838     assert(!Res && "Call operand has unhandled type");
01839     (void)Res;
01840   }
01841   assert(ArgLocs.size() == Ins.size());
01842   SmallVector<SDValue, 16> ArgValues;
01843   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
01844     CCValAssign &VA = ArgLocs[i];
01845 
01846     if (Ins[i].Flags.isByVal()) {
01847       // Byval is used for HFAs in the PCS, but the system should work in a
01848       // non-compliant manner for larger structs.
01849       EVT PtrTy = getPointerTy();
01850       int Size = Ins[i].Flags.getByValSize();
01851       unsigned NumRegs = (Size + 7) / 8;
01852 
01853       // FIXME: This works on big-endian for composite byvals, which are the
01854       // common case. It should work for fundamental types too.
01855       unsigned FrameIdx =
01856         MFI->CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
01857       SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrTy);
01858       InVals.push_back(FrameIdxN);
01859 
01860       continue;
01861     }
01862     
01863     if (VA.isRegLoc()) {
01864       // Arguments stored in registers.
01865       EVT RegVT = VA.getLocVT();
01866 
01867       SDValue ArgValue;
01868       const TargetRegisterClass *RC;
01869 
01870       if (RegVT == MVT::i32)
01871         RC = &AArch64::GPR32RegClass;
01872       else if (RegVT == MVT::i64)
01873         RC = &AArch64::GPR64RegClass;
01874       else if (RegVT == MVT::f16)
01875         RC = &AArch64::FPR16RegClass;
01876       else if (RegVT == MVT::f32)
01877         RC = &AArch64::FPR32RegClass;
01878       else if (RegVT == MVT::f64 || RegVT.is64BitVector())
01879         RC = &AArch64::FPR64RegClass;
01880       else if (RegVT == MVT::f128 || RegVT.is128BitVector())
01881         RC = &AArch64::FPR128RegClass;
01882       else
01883         llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
01884 
01885       // Transform the arguments in physical registers into virtual ones.
01886       unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
01887       ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
01888 
01889       // If this is an 8, 16 or 32-bit value, it is really passed promoted
01890       // to 64 bits.  Insert an assert[sz]ext to capture this, then
01891       // truncate to the right size.
01892       switch (VA.getLocInfo()) {
01893       default:
01894         llvm_unreachable("Unknown loc info!");
01895       case CCValAssign::Full:
01896         break;
01897       case CCValAssign::BCvt:
01898         ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
01899         break;
01900       case CCValAssign::AExt:
01901       case CCValAssign::SExt:
01902       case CCValAssign::ZExt:
01903         // SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt
01904         // nodes after our lowering.
01905         assert(RegVT == Ins[i].VT && "incorrect register location selected");
01906         break;
01907       }
01908 
01909       InVals.push_back(ArgValue);
01910 
01911     } else { // VA.isRegLoc()
01912       assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
01913       unsigned ArgOffset = VA.getLocMemOffset();
01914       unsigned ArgSize = VA.getValVT().getSizeInBits() / 8;
01915 
01916       uint32_t BEAlign = 0;
01917       if (ArgSize < 8 && !Subtarget->isLittleEndian())
01918         BEAlign = 8 - ArgSize;
01919 
01920       int FI = MFI->CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
01921 
01922       // Create load nodes to retrieve arguments from the stack.
01923       SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
01924       SDValue ArgValue;
01925 
01926       // For a NON_EXTLOAD, the generic code in getLoad asserts that ValVT == MemVT.
01927       ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
01928       MVT MemVT = VA.getValVT();
01929 
01930       switch (VA.getLocInfo()) {
01931       default:
01932         break;
01933       case CCValAssign::BCvt:
01934         MemVT = VA.getLocVT();
01935         break;
01936       case CCValAssign::SExt:
01937         ExtType = ISD::SEXTLOAD;
01938         break;
01939       case CCValAssign::ZExt:
01940         ExtType = ISD::ZEXTLOAD;
01941         break;
01942       case CCValAssign::AExt:
01943         ExtType = ISD::EXTLOAD;
01944         break;
01945       }
01946 
01947       ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN,
01948                                 MachinePointerInfo::getFixedStack(FI),
01949                                 MemVT, false, false, false, 0, nullptr);
01950 
01951       InVals.push_back(ArgValue);
01952     }
01953   }
01954 
01955   // varargs
01956   if (isVarArg) {
01957     if (!Subtarget->isTargetDarwin()) {
01958       // The AAPCS variadic function ABI is identical to the non-variadic
01959       // one. As a result there may be more arguments in registers and we should
01960       // save them so va_arg can find them later.
01961       saveVarArgRegisters(CCInfo, DAG, DL, Chain);
01962     }
01963 
01964     AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
01965     // This will point to the next argument passed via stack.
01966     unsigned StackOffset = CCInfo.getNextStackOffset();
01967     // We currently pass all varargs at 8-byte alignment.
01968     StackOffset = ((StackOffset + 7) & ~7);
01969     AFI->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true));
01970   }
01971 
01972   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
01973   unsigned StackArgSize = CCInfo.getNextStackOffset();
01974   bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
01975   if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
01976     // This is a non-standard ABI so by fiat I say we're allowed to make full
01977     // use of the stack area to be popped, which must be aligned to 16 bytes in
01978     // any case:
01979     StackArgSize = RoundUpToAlignment(StackArgSize, 16);
01980 
01981     // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
01982     // a multiple of 16.
01983     FuncInfo->setArgumentStackToRestore(StackArgSize);
01984 
01985     // This realignment carries over to the available bytes below. Our own
01986     // callers will guarantee the space is free by giving an aligned value to
01987     // CALLSEQ_START.
01988   }
01989   // Even if we're not expected to free up the space, it's useful to know how
01990   // much is there while considering tail calls (because we can reuse it).
01991   FuncInfo->setBytesInStackArgArea(StackArgSize);
01992 
01993   return Chain;
01994 }
01995 
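/// saveVarArgRegisters - Spill the GPR argument registers (X0-X7) that were
/// not consumed by named arguments, and likewise the FPR registers (Q0-Q7)
/// when FP/SIMD is available, into stack objects so that va_arg can find them.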
01996 void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
01997                                                 SelectionDAG &DAG, SDLoc DL,
01998                                                 SDValue &Chain) const {
01999   MachineFunction &MF = DAG.getMachineFunction();
02000   MachineFrameInfo *MFI = MF.getFrameInfo();
02001   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
02002 
02003   SmallVector<SDValue, 8> MemOps;
02004 
02005   static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2,
02006                                           AArch64::X3, AArch64::X4, AArch64::X5,
02007                                           AArch64::X6, AArch64::X7 };
02008   static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs);
02009   unsigned FirstVariadicGPR =
02010       CCInfo.getFirstUnallocated(GPRArgRegs, NumGPRArgRegs);
02011 
02012   unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
02013   int GPRIdx = 0;
02014   if (GPRSaveSize != 0) {
02015     GPRIdx = MFI->CreateStackObject(GPRSaveSize, 8, false);
02016 
02017     SDValue FIN = DAG.getFrameIndex(GPRIdx, getPointerTy());
02018 
02019     for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
02020       unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
02021       SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
02022       SDValue Store =
02023           DAG.getStore(Val.getValue(1), DL, Val, FIN,
02024                        MachinePointerInfo::getStack(i * 8), false, false, 0);
02025       MemOps.push_back(Store);
02026       FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
02027                         DAG.getConstant(8, getPointerTy()));
02028     }
02029   }
02030   FuncInfo->setVarArgsGPRIndex(GPRIdx);
02031   FuncInfo->setVarArgsGPRSize(GPRSaveSize);
02032 
02033   if (Subtarget->hasFPARMv8()) {
02034     static const MCPhysReg FPRArgRegs[] = {
02035         AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
02036         AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
02037     static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs);
02038     unsigned FirstVariadicFPR =
02039         CCInfo.getFirstUnallocated(FPRArgRegs, NumFPRArgRegs);
02040 
02041     unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
02042     int FPRIdx = 0;
02043     if (FPRSaveSize != 0) {
02044       FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false);
02045 
02046       SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy());
02047 
02048       for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
02049         unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
02050         SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
02051 
02052         SDValue Store =
02053             DAG.getStore(Val.getValue(1), DL, Val, FIN,
02054                          MachinePointerInfo::getStack(i * 16), false, false, 0);
02055         MemOps.push_back(Store);
02056         FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
02057                           DAG.getConstant(16, getPointerTy()));
02058       }
02059     }
02060     FuncInfo->setVarArgsFPRIndex(FPRIdx);
02061     FuncInfo->setVarArgsFPRSize(FPRSaveSize);
02062   }
02063 
02064   if (!MemOps.empty()) {
02065     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
02066   }
02067 }
02068 
02069 /// LowerCallResult - Lower the result values of a call into the
02070 /// appropriate copies out of appropriate physical registers.
02071 SDValue AArch64TargetLowering::LowerCallResult(
02072     SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
02073     const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
02074     SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
02075     SDValue ThisVal) const {
02076   CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
02077                           ? RetCC_AArch64_WebKit_JS
02078                           : RetCC_AArch64_AAPCS;
02079   // Assign locations to each value returned by this call.
02080   SmallVector<CCValAssign, 16> RVLocs;
02081   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
02082                  *DAG.getContext());
02083   CCInfo.AnalyzeCallResult(Ins, RetCC);
02084 
02085   // Copy all of the result registers out of their specified physreg.
02086   for (unsigned i = 0; i != RVLocs.size(); ++i) {
02087     CCValAssign VA = RVLocs[i];
02088 
02089     // Pass 'this' value directly from the argument to return value, to avoid
02090     // reg unit interference
02091     if (i == 0 && isThisReturn) {
02092       assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
02093              "unexpected return calling convention register assignment");
02094       InVals.push_back(ThisVal);
02095       continue;
02096     }
02097 
02098     SDValue Val =
02099         DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
02100     Chain = Val.getValue(1);
02101     InFlag = Val.getValue(2);
02102 
02103     switch (VA.getLocInfo()) {
02104     default:
02105       llvm_unreachable("Unknown loc info!");
02106     case CCValAssign::Full:
02107       break;
02108     case CCValAssign::BCvt:
02109       Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
02110       break;
02111     }
02112 
02113     InVals.push_back(Val);
02114   }
02115 
02116   return Chain;
02117 }
02118 
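/// isEligibleForTailCallOptimization - Decide whether this call may be lowered
/// as a tail call: the calling conventions and result layouts must be
/// compatible, byval parameters in the caller and externally-weak callees rule
/// it out, and any stack-based arguments must fit within the caller's own
/// argument area.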
02119 bool AArch64TargetLowering::isEligibleForTailCallOptimization(
02120     SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
02121     bool isCalleeStructRet, bool isCallerStructRet,
02122     const SmallVectorImpl<ISD::OutputArg> &Outs,
02123     const SmallVectorImpl<SDValue> &OutVals,
02124     const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
02125   // For CallingConv::C this function knows whether the ABI needs
02126   // changing. That's not true for other conventions so they will have to opt in
02127   // manually.
02128   if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C)
02129     return false;
02130 
02131   const MachineFunction &MF = DAG.getMachineFunction();
02132   const Function *CallerF = MF.getFunction();
02133   CallingConv::ID CallerCC = CallerF->getCallingConv();
02134   bool CCMatch = CallerCC == CalleeCC;
02135 
02136   // Byval parameters hand the function a pointer directly into the stack area
02137   // we want to reuse during a tail call. Working around this *is* possible (see
02138   // X86) but less efficient and uglier in LowerCall.
02139   for (Function::const_arg_iterator i = CallerF->arg_begin(),
02140                                     e = CallerF->arg_end();
02141        i != e; ++i)
02142     if (i->hasByValAttr())
02143       return false;
02144 
02145   if (getTargetMachine().Options.GuaranteedTailCallOpt) {
02146     if (IsTailCallConvention(CalleeCC) && CCMatch)
02147       return true;
02148     return false;
02149   }
02150 
02151   // Externally-defined functions with weak linkage should not be
02152   // tail-called on AArch64 when the OS does not support dynamic
02153   // pre-emption of symbols, as the AAELF spec requires normal calls
02154   // to undefined weak functions to be replaced with a NOP or jump to the
02155   // next instruction. The behaviour of branch instructions in this
02156   // situation (as used for tail calls) is implementation-defined, so we
02157   // cannot rely on the linker replacing the tail call with a return.
02158   if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
02159     const GlobalValue *GV = G->getGlobal();
02160     if (GV->hasExternalWeakLinkage())
02161       return false;
02162   }
02163 
02164   // Now we search for cases where we can use a tail call without changing the
02165   // ABI. Sibcall is used in some places (particularly gcc) to refer to this
02166   // concept.
02167 
02168   // I want anyone implementing a new calling convention to think long and hard
02169   // about this assert.
02170   assert((!isVarArg || CalleeCC == CallingConv::C) &&
02171          "Unexpected variadic calling convention");
02172 
02173   if (isVarArg && !Outs.empty()) {
02174     // At least two cases here: if caller is fastcc then we can't have any
02175     // memory arguments (we'd be expected to clean up the stack afterwards). If
02176     // caller is C then we could potentially use its argument area.
02177 
02178     // FIXME: for now we take the most conservative of these in both cases:
02179     // disallow all variadic memory operands.
02180     SmallVector<CCValAssign, 16> ArgLocs;
02181     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
02182                    *DAG.getContext());
02183 
02184     CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
02185     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
02186       if (!ArgLocs[i].isRegLoc())
02187         return false;
02188   }
02189 
02190   // If the calling conventions do not match, then we'd better make sure the
02191   // results are returned in the same way as what the caller expects.
02192   if (!CCMatch) {
02193     SmallVector<CCValAssign, 16> RVLocs1;
02194     CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
02195                     *DAG.getContext());
02196     CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForCall(CalleeCC, isVarArg));
02197 
02198     SmallVector<CCValAssign, 16> RVLocs2;
02199     CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
02200                     *DAG.getContext());
02201     CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForCall(CallerCC, isVarArg));
02202 
02203     if (RVLocs1.size() != RVLocs2.size())
02204       return false;
02205     for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
02206       if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
02207         return false;
02208       if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
02209         return false;
02210       if (RVLocs1[i].isRegLoc()) {
02211         if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
02212           return false;
02213       } else {
02214         if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
02215           return false;
02216       }
02217     }
02218   }
02219 
02220   // Nothing more to check if the callee is taking no arguments
02221   if (Outs.empty())
02222     return true;
02223 
02224   SmallVector<CCValAssign, 16> ArgLocs;
02225   CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
02226                  *DAG.getContext());
02227 
02228   CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
02229 
02230   const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
02231 
02232   // If the stack arguments for this call would fit into our own save area then
02233   // the call can be made tail.
02234   return CCInfo.getNextStackOffset() <= FuncInfo->getBytesInStackArgArea();
02235 }
02236 
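/// addTokenForArgument - Build a TokenFactor so that a store into a tail
/// call's argument slot is chained after any load from an overlapping incoming
/// argument slot, ensuring the old value is read before it is clobbered.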
02237 SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
02238                                                    SelectionDAG &DAG,
02239                                                    MachineFrameInfo *MFI,
02240                                                    int ClobberedFI) const {
02241   SmallVector<SDValue, 8> ArgChains;
02242   int64_t FirstByte = MFI->getObjectOffset(ClobberedFI);
02243   int64_t LastByte = FirstByte + MFI->getObjectSize(ClobberedFI) - 1;
02244 
02245   // Include the original chain at the beginning of the list. When this is
02246   // used by target LowerCall hooks, this helps legalize find the
02247   // CALLSEQ_BEGIN node.
02248   ArgChains.push_back(Chain);
02249 
02250   // Add a chain value for each stack-argument load that overlaps the clobbered slot.
02251   for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
02252                             UE = DAG.getEntryNode().getNode()->use_end();
02253        U != UE; ++U)
02254     if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
02255       if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
02256         if (FI->getIndex() < 0) {
02257           int64_t InFirstByte = MFI->getObjectOffset(FI->getIndex());
02258           int64_t InLastByte = InFirstByte;
02259           InLastByte += MFI->getObjectSize(FI->getIndex()) - 1;
02260 
02261           if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
02262               (FirstByte <= InFirstByte && InFirstByte <= LastByte))
02263             ArgChains.push_back(SDValue(L, 1));
02264         }
02265 
02266   // Build a tokenfactor for all the chains.
02267   return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
02268 }
02269 
02270 bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
02271                                                    bool TailCallOpt) const {
02272   return CallCC == CallingConv::Fast && TailCallOpt;
02273 }
02274 
02275 bool AArch64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const {
02276   return CallCC == CallingConv::Fast;
02277 }
02278 
02279 /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
02280 /// and add input and output parameter nodes.
02281 SDValue
02282 AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
02283                                  SmallVectorImpl<SDValue> &InVals) const {
02284   SelectionDAG &DAG = CLI.DAG;
02285   SDLoc &DL = CLI.DL;
02286   SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
02287   SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
02288   SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
02289   SDValue Chain = CLI.Chain;
02290   SDValue Callee = CLI.Callee;
02291   bool &IsTailCall = CLI.IsTailCall;
02292   CallingConv::ID CallConv = CLI.CallConv;
02293   bool IsVarArg = CLI.IsVarArg;
02294 
02295   MachineFunction &MF = DAG.getMachineFunction();
02296   bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
02297   bool IsThisReturn = false;
02298 
02299   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
02300   bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
02301   bool IsSibCall = false;
02302 
02303   if (IsTailCall) {
02304     // Check if it's really possible to do a tail call.
02305     IsTailCall = isEligibleForTailCallOptimization(
02306         Callee, CallConv, IsVarArg, IsStructRet,
02307         MF.getFunction()->hasStructRetAttr(), Outs, OutVals, Ins, DAG);
02308     if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall())
02309       report_fatal_error("failed to perform tail call elimination on a call "
02310                          "site marked musttail");
02311 
02312     // A sibling call is one where we're under the usual C ABI and not planning
02313     // to change that but can still do a tail call:
02314     if (!TailCallOpt && IsTailCall)
02315       IsSibCall = true;
02316 
02317     if (IsTailCall)
02318       ++NumTailCalls;
02319   }
02320 
02321   // Analyze operands of the call, assigning locations to each operand.
02322   SmallVector<CCValAssign, 16> ArgLocs;
02323   CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
02324                  *DAG.getContext());
02325 
02326   if (IsVarArg) {
02327     // Handle fixed and variable vector arguments differently.
02328     // Variable vector arguments always go into memory.
02329     unsigned NumArgs = Outs.size();
02330 
02331     for (unsigned i = 0; i != NumArgs; ++i) {
02332       MVT ArgVT = Outs[i].VT;
02333       ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
02334       CCAssignFn *AssignFn = CCAssignFnForCall(CallConv,
02335                                                /*IsVarArg=*/ !Outs[i].IsFixed);
02336       bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
02337       assert(!Res && "Call operand has unhandled type");
02338       (void)Res;
02339     }
02340   } else {
02341     // At this point, Outs[].VT may already be promoted to i32. To correctly
02342     // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
02343     // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
02344     // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here
02345     // we use a special version of AnalyzeCallOperands to pass in ValVT and
02346     // LocVT.
02347     unsigned NumArgs = Outs.size();
02348     for (unsigned i = 0; i != NumArgs; ++i) {
02349       MVT ValVT = Outs[i].VT;
02350       // Get type of the original argument.
02351       EVT ActualVT = getValueType(CLI.getArgs()[Outs[i].OrigArgIndex].Ty,
02352                                   /*AllowUnknown*/ true);
02353       MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT;
02354       ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
02355       // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
02356       if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
02357         ValVT = MVT::i8;
02358       else if (ActualMVT == MVT::i16)
02359         ValVT = MVT::i16;
02360 
02361       CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
02362       bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo);
02363       assert(!Res && "Call operand has unhandled type");
02364       (void)Res;
02365     }
02366   }
02367 
02368   // Get a count of how many bytes are to be pushed on the stack.
02369   unsigned NumBytes = CCInfo.getNextStackOffset();
02370 
02371   if (IsSibCall) {
02372     // Since we're not changing the ABI to make this a tail call, the memory
02373     // operands are already available in the caller's incoming argument space.
02374     NumBytes = 0;
02375   }
02376 
02377   // FPDiff is the byte offset of the call's argument area from the callee's.
02378   // Stores to callee stack arguments will be placed in FixedStackSlots offset
02379   // by this amount for a tail call. In a sibling call it must be 0 because the
02380   // caller will deallocate the entire stack and the callee still expects its
02381   // arguments to begin at SP+0. Completely unused for non-tail calls.
02382   int FPDiff = 0;
02383 
02384   if (IsTailCall && !IsSibCall) {
02385     unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
02386 
02387     // Since callee will pop argument stack as a tail call, we must keep the
02388     // popped size 16-byte aligned.
02389     NumBytes = RoundUpToAlignment(NumBytes, 16);
02390 
02391     // FPDiff will be negative if this tail call requires more space than we
02392     // would automatically have in our incoming argument space. Positive if we
02393     // can actually shrink the stack.
02394     FPDiff = NumReusableBytes - NumBytes;
02395 
02396     // The stack pointer must be 16-byte aligned at all times it's used for a
02397     // memory operation, which in practice means at *all* times and in
02398     // particular across call boundaries. Therefore our own arguments started at
02399     // a 16-byte aligned SP and the delta applied for the tail call should
02400     // satisfy the same constraint.
02401     assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
02402   }
02403 
02404   // Adjust the stack pointer for the new arguments...
02405   // These operations are automatically eliminated by the prolog/epilog pass
02406   if (!IsSibCall)
02407     Chain =
02408         DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true), DL);
02409 
02410   SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, getPointerTy());
02411 
02412   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
02413   SmallVector<SDValue, 8> MemOpChains;
02414 
02415   // Walk the register/memloc assignments, inserting copies/loads.
02416   for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
02417        ++i, ++realArgIdx) {
02418     CCValAssign &VA = ArgLocs[i];
02419     SDValue Arg = OutVals[realArgIdx];
02420     ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
02421 
02422     // Promote the value if needed.
02423     switch (VA.getLocInfo()) {
02424     default:
02425       llvm_unreachable("Unknown loc info!");
02426     case CCValAssign::Full:
02427       break;
02428     case CCValAssign::SExt:
02429       Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
02430       break;
02431     case CCValAssign::ZExt:
02432       Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
02433       break;
02434     case CCValAssign::AExt:
02435       if (Outs[realArgIdx].ArgVT == MVT::i1) {
02436         // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
02437         Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
02438         Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
02439       }
02440       Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
02441       break;
02442     case CCValAssign::BCvt:
02443       Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
02444       break;
02445     case CCValAssign::FPExt:
02446       Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
02447       break;
02448     }
02449 
02450     if (VA.isRegLoc()) {
02451       if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i64) {
02452         assert(VA.getLocVT() == MVT::i64 &&
02453                "unexpected calling convention register assignment");
02454         assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
02455                "unexpected use of 'returned'");
02456         IsThisReturn = true;
02457       }
02458       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
02459     } else {
02460       assert(VA.isMemLoc());
02461 
02462       SDValue DstAddr;
02463       MachinePointerInfo DstInfo;
02464 
02465       // FIXME: This works on big-endian for composite byvals, which are the
02466       // common case. It should also work for fundamental types.
02467       uint32_t BEAlign = 0;
02468       unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
02469                                         : VA.getValVT().getSizeInBits();
02470       OpSize = (OpSize + 7) / 8;
02471       if (!Subtarget->isLittleEndian() && !Flags.isByVal()) {
02472         if (OpSize < 8)
02473           BEAlign = 8 - OpSize;
02474       }
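      // Illustrative example of the big-endian adjustment above: a 2-byte
      // (i16) argument has OpSize == 2, so BEAlign == 6 and the value is
      // stored at LocMemOffset + 6, i.e. in the high-addressed end of its
      // 8-byte slot, where a big-endian callee expects to find it.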
02475       unsigned LocMemOffset = VA.getLocMemOffset();
02476       int32_t Offset = LocMemOffset + BEAlign;
02477       SDValue PtrOff = DAG.getIntPtrConstant(Offset);
02478       PtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff);
02479 
02480       if (IsTailCall) {
02481         Offset = Offset + FPDiff;
02482         int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
02483 
02484         DstAddr = DAG.getFrameIndex(FI, getPointerTy());
02485         DstInfo = MachinePointerInfo::getFixedStack(FI);
02486 
02487         // Make sure any stack arguments overlapping with where we're storing
02488         // are loaded before this eventual operation. Otherwise they'll be
02489         // clobbered.
02490         Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
02491       } else {
02492         SDValue PtrOff = DAG.getIntPtrConstant(Offset);
02493 
02494         DstAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff);
02495         DstInfo = MachinePointerInfo::getStack(LocMemOffset);
02496       }
02497 
02498       if (Outs[i].Flags.isByVal()) {
02499         SDValue SizeNode =
02500             DAG.getConstant(Outs[i].Flags.getByValSize(), MVT::i64);
02501         SDValue Cpy = DAG.getMemcpy(
02502             Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
02503             /*isVol = */ false,
02504             /*AlwaysInline = */ false, DstInfo, MachinePointerInfo());
02505 
02506         MemOpChains.push_back(Cpy);
02507       } else {
02508         // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
02509         // promoted to a legal register type i32, we should truncate Arg back to
02510         // i1/i8/i16.
02511         if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
02512             VA.getValVT() == MVT::i16)
02513           Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
02514 
02515         SDValue Store =
02516             DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, false, false, 0);
02517         MemOpChains.push_back(Store);
02518       }
02519     }
02520   }
02521 
02522   if (!MemOpChains.empty())
02523     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
02524 
02525   // Build a sequence of copy-to-reg nodes chained together with token chain
02526   // and flag operands which copy the outgoing args into the appropriate regs.
02527   SDValue InFlag;
02528   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
02529     Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[i].first,
02530                              RegsToPass[i].second, InFlag);
02531     InFlag = Chain.getValue(1);
02532   }
02533 
02534   // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
02535   // direct call is), turn it into a TargetGlobalAddress/TargetExternalSymbol
02536   // node so that legalize doesn't hack it.
02537   if (getTargetMachine().getCodeModel() == CodeModel::Large &&
02538       Subtarget->isTargetMachO()) {
02539     if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
02540       const GlobalValue *GV = G->getGlobal();
02541       bool InternalLinkage = GV->hasInternalLinkage();
02542       if (InternalLinkage)
02543         Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0);
02544       else {
02545         Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0,
02546                                             AArch64II::MO_GOT);
02547         Callee = DAG.getNode(AArch64ISD::LOADgot, DL, getPointerTy(), Callee);
02548       }
02549     } else if (ExternalSymbolSDNode *S =
02550                    dyn_cast<ExternalSymbolSDNode>(Callee)) {
02551       const char *Sym = S->getSymbol();
02552       Callee =
02553           DAG.getTargetExternalSymbol(Sym, getPointerTy(), AArch64II::MO_GOT);
02554       Callee = DAG.getNode(AArch64ISD::LOADgot, DL, getPointerTy(), Callee);
02555     }
02556   } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
02557     const GlobalValue *GV = G->getGlobal();
02558     Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0);
02559   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
02560     const char *Sym = S->getSymbol();
02561     Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), 0);
02562   }
02563 
02564   // We don't usually want to end the call-sequence here because we would tidy
02565   // the frame up *after* the call; however, in the ABI-changing tail-call case
02566   // we've carefully laid out the parameters so that when sp is reset they'll be
02567   // in the correct location.
02568   if (IsTailCall && !IsSibCall) {
02569     Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
02570                                DAG.getIntPtrConstant(0, true), InFlag, DL);
02571     InFlag = Chain.getValue(1);
02572   }
02573 
02574   std::vector<SDValue> Ops;
02575   Ops.push_back(Chain);
02576   Ops.push_back(Callee);
02577 
02578   if (IsTailCall) {
02579     // Each tail call may have to adjust the stack by a different amount, so
02580     // this information must travel along with the operation for eventual
02581     // consumption by emitEpilogue.
02582     Ops.push_back(DAG.getTargetConstant(FPDiff, MVT::i32));
02583   }
02584 
02585   // Add argument registers to the end of the list so that they are known live
02586   // into the call.
02587   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
02588     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
02589                                   RegsToPass[i].second.getValueType()));
02590 
02591   // Add a register mask operand representing the call-preserved registers.
02592   const uint32_t *Mask;
02593   const TargetRegisterInfo *TRI =
02594       getTargetMachine().getSubtargetImpl()->getRegisterInfo();
02595   const AArch64RegisterInfo *ARI =
02596       static_cast<const AArch64RegisterInfo *>(TRI);
02597   if (IsThisReturn) {
02598     // For 'this' returns, use the X0-preserving mask if applicable
02599     Mask = ARI->getThisReturnPreservedMask(CallConv);
02600     if (!Mask) {
02601       IsThisReturn = false;
02602       Mask = ARI->getCallPreservedMask(CallConv);
02603     }
02604   } else
02605     Mask = ARI->getCallPreservedMask(CallConv);
02606 
02607   assert(Mask && "Missing call preserved mask for calling convention");
02608   Ops.push_back(DAG.getRegisterMask(Mask));
02609 
02610   if (InFlag.getNode())
02611     Ops.push_back(InFlag);
02612 
02613   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
02614 
02615   // If we're doing a tail call, use a TC_RETURN here rather than an
02616   // actual call instruction.
02617   if (IsTailCall)
02618     return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
02619 
02620   // Returns a chain and a flag for retval copy to use.
02621   Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops);
02622   InFlag = Chain.getValue(1);
02623 
02624   uint64_t CalleePopBytes = DoesCalleeRestoreStack(CallConv, TailCallOpt)
02625                                 ? RoundUpToAlignment(NumBytes, 16)
02626                                 : 0;
02627 
02628   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
02629                              DAG.getIntPtrConstant(CalleePopBytes, true),
02630                              InFlag, DL);
02631   if (!Ins.empty())
02632     InFlag = Chain.getValue(1);
02633 
02634   // Handle result values, copying them out of physregs into vregs that we
02635   // return.
02636   return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
02637                          InVals, IsThisReturn,
02638                          IsThisReturn ? OutVals[0] : SDValue());
02639 }
02640 
02641 bool AArch64TargetLowering::CanLowerReturn(
02642     CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
02643     const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
02644   CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
02645                           ? RetCC_AArch64_WebKit_JS
02646                           : RetCC_AArch64_AAPCS;
02647   SmallVector<CCValAssign, 16> RVLocs;
02648   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
02649   return CCInfo.CheckReturn(Outs, RetCC);
02650 }
02651 
02652 SDValue
02653 AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
02654                                    bool isVarArg,
02655                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
02656                                    const SmallVectorImpl<SDValue> &OutVals,
02657                                    SDLoc DL, SelectionDAG &DAG) const {
02658   CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
02659                           ? RetCC_AArch64_WebKit_JS
02660                           : RetCC_AArch64_AAPCS;
02661   SmallVector<CCValAssign, 16> RVLocs;
02662   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
02663                  *DAG.getContext());
02664   CCInfo.AnalyzeReturn(Outs, RetCC);
02665 
02666   // Copy the result values into the output registers.
02667   SDValue Flag;
02668   SmallVector<SDValue, 4> RetOps(1, Chain);
02669   for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
02670        ++i, ++realRVLocIdx) {
02671     CCValAssign &VA = RVLocs[i];
02672     assert(VA.isRegLoc() && "Can only return in registers!");
02673     SDValue Arg = OutVals[realRVLocIdx];
02674 
02675     switch (VA.getLocInfo()) {
02676     default:
02677       llvm_unreachable("Unknown loc info!");
02678     case CCValAssign::Full:
02679       if (Outs[i].ArgVT == MVT::i1) {
02680         // AAPCS requires i1 to be zero-extended to i8 by the producer of the
02681         // value. This is strictly redundant on Darwin (which uses "zeroext
02682         // i1"), but will be optimised out before ISel.
02683         Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
02684         Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
02685       }
02686       break;
02687     case CCValAssign::BCvt:
02688       Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
02689       break;
02690     }
02691 
02692     Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
02693     Flag = Chain.getValue(1);
02694     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
02695   }
02696 
02697   RetOps[0] = Chain; // Update chain.
02698 
02699   // Add the flag if we have it.
02700   if (Flag.getNode())
02701     RetOps.push_back(Flag);
02702 
02703   return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps);
02704 }
02705 
02706 //===----------------------------------------------------------------------===//
02707 //  Other Lowering Code
02708 //===----------------------------------------------------------------------===//
02709 
02710 SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
02711                                                   SelectionDAG &DAG) const {
02712   EVT PtrVT = getPointerTy();
02713   SDLoc DL(Op);
02714   const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
02715   const GlobalValue *GV = GN->getGlobal();
02716   unsigned char OpFlags =
02717       Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
02718 
02719   assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
02720          "unexpected offset in global node");
02721 
02722   // This also catches the large code model case for Darwin.
02723   if ((OpFlags & AArch64II::MO_GOT) != 0) {
02724     SDValue GotAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
02725     // FIXME: Once remat is capable of dealing with instructions with register
02726     // operands, expand this into two nodes instead of using a wrapper node.
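    // The GOT-indirect sequence this lowers to is roughly (illustrative, ELF
    // assembly spelling; Darwin uses var@GOTPAGE / var@GOTPAGEOFF instead):
    //     adrp x0, :got:var
    //     ldr  x0, [x0, :got_lo12:var]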
02727     return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr);
02728   }
02729 
02730   if ((OpFlags & AArch64II::MO_CONSTPOOL) != 0) {
02731     assert(getTargetMachine().getCodeModel() == CodeModel::Small &&
02732            "use of MO_CONSTPOOL only supported on small model");
02733     SDValue Hi = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, AArch64II::MO_PAGE);
02734     SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
02735     unsigned char LoFlags = AArch64II::MO_PAGEOFF | AArch64II::MO_NC;
02736     SDValue Lo = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, LoFlags);
02737     SDValue PoolAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
02738     SDValue GlobalAddr = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), PoolAddr,
02739                                      MachinePointerInfo::getConstantPool(),
02740                                      /*isVolatile=*/ false,
02741                                      /*isNonTemporal=*/ true,
02742                                      /*isInvariant=*/ true, 8);
02743     if (GN->getOffset() != 0)
02744       return DAG.getNode(ISD::ADD, DL, PtrVT, GlobalAddr,
02745                          DAG.getConstant(GN->getOffset(), PtrVT));
02746     return GlobalAddr;
02747   }
02748 
02749   if (getTargetMachine().getCodeModel() == CodeModel::Large) {
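    // Large-model addresses are built piecewise from 16-bit chunks, roughly
    // (illustrative ELF assembly spelling):
    //     movz x0, #:abs_g3:var
    //     movk x0, #:abs_g2_nc:var
    //     movk x0, #:abs_g1_nc:var
    //     movk x0, #:abs_g0_nc:var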
02750     const unsigned char MO_NC = AArch64II::MO_NC;
02751     return DAG.getNode(
02752         AArch64ISD::WrapperLarge, DL, PtrVT,
02753         DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G3),
02754         DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G2 | MO_NC),
02755         DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G1 | MO_NC),
02756         DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G0 | MO_NC));
02757   } else {
02758     // Use ADRP/ADD or ADRP/LDR for everything else: the small model on ELF and
02759     // the only correct model on Darwin.
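    // The resulting sequence is roughly (illustrative ELF assembly spelling):
    //     adrp x0, var            ; page containing var
    //     add  x0, x0, :lo12:var  ; offset within that page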
02760     SDValue Hi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
02761                                             OpFlags | AArch64II::MO_PAGE);
02762     unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC;
02763     SDValue Lo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, LoFlags);
02764 
02765     SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
02766     return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
02767   }
02768 }
02769 
02770 /// \brief Convert a TLS address reference into the correct sequence of loads
02771 /// and calls to compute the variable's address (for Darwin, currently) and
02772 /// return an SDValue containing the final node.
02773 ///
02774 /// Darwin only has one TLS scheme which must be capable of dealing with the
02775 /// fully general situation, in the worst case. This means:
02776 ///     + "extern __thread" declaration.
02777 ///     + Defined in a possibly unknown dynamic library.
02778 ///
02779 /// The general system is that each __thread variable has a [3 x i64] descriptor
02780 /// which contains information used by the runtime to calculate the address. The
02781 /// only part of this the compiler needs to know about is the first xword, which
02782 /// contains a function pointer that must be called with the address of the
02783 /// entire descriptor in "x0".
02784 ///
02785 /// Since this descriptor may be in a different unit, in general even the
02786 /// descriptor must be accessed via an indirect load. The "ideal" code sequence
02787 /// is:
02788 ///     adrp x0, _var@TLVPPAGE
02789 ///     ldr x0, [x0, _var@TLVPPAGEOFF]   ; x0 now contains address of descriptor
02790 ///     ldr x1, [x0]                     ; x1 contains 1st entry of descriptor,
02791 ///                                      ; the function pointer
02792 ///     blr x1                           ; Uses descriptor address in x0
02793 ///     ; Address of _var is now in x0.
02794 ///
02795 /// If the address of _var's descriptor *is* known to the linker, then it can
02796 /// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
02797 /// a slight efficiency gain.
02798 SDValue
02799 AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
02800                                                    SelectionDAG &DAG) const {
02801   assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin");
02802 
02803   SDLoc DL(Op);
02804   MVT PtrVT = getPointerTy();
02805   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
02806 
02807   SDValue TLVPAddr =
02808       DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
02809   SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
02810 
02811   // The first entry in the descriptor is a function pointer that we must call
02812   // to obtain the address of the variable.
02813   SDValue Chain = DAG.getEntryNode();
02814   SDValue FuncTLVGet =
02815       DAG.getLoad(MVT::i64, DL, Chain, DescAddr, MachinePointerInfo::getGOT(),
02816                   false, true, true, 8);
02817   Chain = FuncTLVGet.getValue(1);
02818 
02819   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
02820   MFI->setAdjustsStack(true);
02821 
02822   // TLS calls preserve all registers except those that absolutely must be
02823   // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
02824   // silly).
02825   const TargetRegisterInfo *TRI =
02826       getTargetMachine().getSubtargetImpl()->getRegisterInfo();
02827   const AArch64RegisterInfo *ARI =
02828       static_cast<const AArch64RegisterInfo *>(TRI);
02829   const uint32_t *Mask = ARI->getTLSCallPreservedMask();
02830 
02831   // Finally, we can make the call. This is just a degenerate version of a
02832   // normal AArch64 call node: x0 takes the address of the descriptor, and
02833   // returns the address of the variable in this thread.
02834   Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
02835   Chain =
02836       DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
02837                   Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
02838                   DAG.getRegisterMask(Mask), Chain.getValue(1));
02839   return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
02840 }
02841 
02842 /// When accessing thread-local variables under either the general-dynamic or
02843 /// local-dynamic system, we make a "TLS-descriptor" call. The variable will
02844 /// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
02845 /// is a function pointer to carry out the resolution. This function takes the
02846 /// address of the descriptor in X0 and returns the TPIDR_EL0 offset in X0. All
02847 /// other registers (except LR, NZCV) are preserved.
02848 ///
02849 /// Thus, the ideal call sequence on AArch64 is:
02850 ///
02851 ///     adrp x0, :tlsdesc:thread_var
02852 ///     ldr x8, [x0, :tlsdesc_lo12:thread_var]
02853 ///     add x0, x0, :tlsdesc_lo12:thread_var
02854 ///     .tlsdesccall thread_var
02855 ///     blr x8
02856 ///     (TPIDR_EL0 offset now in x0).
02857 ///
02858 /// The ".tlsdesccall" directive instructs the assembler to insert a particular
02859 /// relocation to help the linker relax this sequence if it turns out to be too
02860 /// conservative.
02861 ///
02862 /// FIXME: we currently produce an extra, duplicated, ADRP instruction, but this
02863 /// is harmless.
02864 SDValue AArch64TargetLowering::LowerELFTLSDescCall(SDValue SymAddr,
02865                                                    SDValue DescAddr, SDLoc DL,
02866                                                    SelectionDAG &DAG) const {
02867   EVT PtrVT = getPointerTy();
02868 
02869   // The function we need to call is simply the first entry in the GOT for this
02870   // descriptor, load it in preparation.
02871   SDValue Func = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, SymAddr);
02872 
02873   // TLS calls preserve all registers except those that absolutely must be
02874   // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
02875   // silly).
02876   const TargetRegisterInfo *TRI =
02877       getTargetMachine().getSubtargetImpl()->getRegisterInfo();
02878   const AArch64RegisterInfo *ARI =
02879       static_cast<const AArch64RegisterInfo *>(TRI);
02880   const uint32_t *Mask = ARI->getTLSCallPreservedMask();
02881 
02882   // The function takes only one argument: the address of the descriptor itself
02883   // in X0.
02884   SDValue Glue, Chain;
02885   Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::X0, DescAddr, Glue);
02886   Glue = Chain.getValue(1);
02887 
02888   // We're now ready to populate the argument list, as with a normal call:
02889   SmallVector<SDValue, 6> Ops;
02890   Ops.push_back(Chain);
02891   Ops.push_back(Func);
02892   Ops.push_back(SymAddr);
02893   Ops.push_back(DAG.getRegister(AArch64::X0, PtrVT));
02894   Ops.push_back(DAG.getRegisterMask(Mask));
02895   Ops.push_back(Glue);
02896 
02897   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
02898   Chain = DAG.getNode(AArch64ISD::TLSDESC_CALL, DL, NodeTys, Ops);
02899   Glue = Chain.getValue(1);
02900 
02901   return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
02902 }
02903 
02904 SDValue
02905 AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
02906                                                 SelectionDAG &DAG) const {
02907   assert(Subtarget->isTargetELF() && "This function expects an ELF target");
02908   assert(getTargetMachine().getCodeModel() == CodeModel::Small &&
02909          "ELF TLS only supported in small memory model");
02910   const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
02911 
02912   TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
02913 
02914   SDValue TPOff;
02915   EVT PtrVT = getPointerTy();
02916   SDLoc DL(Op);
02917   const GlobalValue *GV = GA->getGlobal();
02918 
02919   SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
02920 
02921   if (Model == TLSModel::LocalExec) {
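    // Local-exec offsets are known at static link time and materialized
    // directly, roughly (illustrative assembly; 'tp' is TPIDR_EL0):
    //     movz x0, #:tprel_g1:var
    //     movk x0, #:tprel_g0_nc:var
    //     add  x0, tp, x0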
02922     SDValue HiVar = DAG.getTargetGlobalAddress(
02923         GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
02924     SDValue LoVar = DAG.getTargetGlobalAddress(
02925         GV, DL, PtrVT, 0,
02926         AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
02927 
02928     TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
02929                                        DAG.getTargetConstant(16, MVT::i32)),
02930                     0);
02931     TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
02932                                        DAG.getTargetConstant(0, MVT::i32)),
02933                     0);
02934   } else if (Model == TLSModel::InitialExec) {
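    // Initial-exec loads the (link-time constant) offset from the GOT,
    // roughly (illustrative assembly spelling):
    //     adrp x0, :gottprel:var
    //     ldr  x0, [x0, #:gottprel_lo12:var]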
02935     TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
02936     TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
02937   } else if (Model == TLSModel::LocalDynamic) {
02938     // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
02939     // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
02940     // the beginning of the module's TLS region, followed by a DTPREL offset
02941     // calculation.
02942 
02943     // These accesses will need deduplicating if there's more than one.
02944     AArch64FunctionInfo *MFI =
02945         DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
02946     MFI->incNumLocalDynamicTLSAccesses();
02947 
02948     // Accesses used in this sequence go via the TLS descriptor which lives in
02949     // the GOT. Prepare an address we can use to handle this.
02950     SDValue HiDesc = DAG.getTargetExternalSymbol(
02951         "_TLS_MODULE_BASE_", PtrVT, AArch64II::MO_TLS | AArch64II::MO_PAGE);
02952     SDValue LoDesc = DAG.getTargetExternalSymbol(
02953         "_TLS_MODULE_BASE_", PtrVT,
02954         AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
02955 
02956     // First argument to the descriptor call is the address of the descriptor
02957     // itself.
02958     SDValue DescAddr = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, HiDesc);
02959     DescAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, DescAddr, LoDesc);
02960 
02961     // The call needs a relocation too for linker relaxation. It doesn't make
02962     // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
02963     // the address.
02964     SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
02965                                                   AArch64II::MO_TLS);
02966 
02967     // Now we can calculate the offset from TPIDR_EL0 to this module's
02968     // thread-local area.
02969     TPOff = LowerELFTLSDescCall(SymAddr, DescAddr, DL, DAG);
02970 
02971     // Now use :dtprel_whatever: operations to calculate this variable's offset
02972     // in its thread-storage area.
02973     SDValue HiVar = DAG.getTargetGlobalAddress(
02974         GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
02975     SDValue LoVar = DAG.getTargetGlobalAddress(
02976         GV, DL, MVT::i64, 0,
02977         AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
02978 
02979     SDValue DTPOff =
02980         SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
02981                                    DAG.getTargetConstant(16, MVT::i32)),
02982                 0);
02983     DTPOff =
02984         SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, DTPOff, LoVar,
02985                                    DAG.getTargetConstant(0, MVT::i32)),
02986                 0);
02987 
02988     TPOff = DAG.getNode(ISD::ADD, DL, PtrVT, TPOff, DTPOff);
02989   } else if (Model == TLSModel::GeneralDynamic) {
02990     // Accesses used in this sequence go via the TLS descriptor which lives in
02991     // the GOT. Prepare an address we can use to handle this.
02992     SDValue HiDesc = DAG.getTargetGlobalAddress(
02993         GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGE);
02994     SDValue LoDesc = DAG.getTargetGlobalAddress(
02995         GV, DL, PtrVT, 0,
02996         AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
02997 
02998     // First argument to the descriptor call is the address of the descriptor
02999     // itself.
03000     SDValue DescAddr = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, HiDesc);
03001     DescAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, DescAddr, LoDesc);
03002 
03003     // The call needs a relocation too for linker relaxation. It doesn't make
03004     // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
03005     // the address.
03006     SDValue SymAddr =
03007         DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
03008 
03009     // Finally we can make a call to calculate the offset from tpidr_el0.
03010     TPOff = LowerELFTLSDescCall(SymAddr, DescAddr, DL, DAG);
03011   } else
03012     llvm_unreachable("Unsupported ELF TLS access model");
03013 
03014   return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
03015 }
03016 
03017 SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
03018                                                      SelectionDAG &DAG) const {
03019   if (Subtarget->isTargetDarwin())
03020     return LowerDarwinGlobalTLSAddress(Op, DAG);
03021   else if (Subtarget->isTargetELF())
03022     return LowerELFGlobalTLSAddress(Op, DAG);
03023 
03024   llvm_unreachable("Unexpected platform trying to use TLS");
03025 }
03026 SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
03027   SDValue Chain = Op.getOperand(0);
03028   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
03029   SDValue LHS = Op.getOperand(2);
03030   SDValue RHS = Op.getOperand(3);
03031   SDValue Dest = Op.getOperand(4);
03032   SDLoc dl(Op);
03033 
03034   // Handle f128 first, since lowering it will result in comparing the return
03035   // value of a libcall against zero, which is just what the rest of LowerBR_CC
03036   // is expecting to deal with.
03037   if (LHS.getValueType() == MVT::f128) {
03038     softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
03039 
03040     // If softenSetCCOperands returned a scalar, we need to compare the result
03041     // against zero to select between true and false values.
03042     if (!RHS.getNode()) {
03043       RHS = DAG.getConstant(0, LHS.getValueType());
03044       CC = ISD::SETNE;
03045     }
03046   }
03047 
03048   // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
03049   // instruction.
03050   unsigned Opc = LHS.getOpcode();
03051   if (LHS.getResNo() == 1 && isa<ConstantSDNode>(RHS) &&
03052       cast<ConstantSDNode>(RHS)->isOne() &&
03053       (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
03054        Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) {
03055     assert((CC == ISD::SETEQ || CC == ISD::SETNE) &&
03056            "Unexpected condition code.");
03057     // Only lower legal XALUO ops.
03058     if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
03059       return SDValue();
03060 
03061     // The actual operation with overflow check.
03062     AArch64CC::CondCode OFCC;
03063     SDValue Value, Overflow;
03064     std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
03065 
03066     if (CC == ISD::SETNE)
03067       OFCC = getInvertedCondCode(OFCC);
03068     SDValue CCVal = DAG.getConstant(OFCC, MVT::i32);
03069 
03070     return DAG.getNode(AArch64ISD::BRCOND, SDLoc(LHS), MVT::Other, Chain, Dest,
03071                        CCVal, Overflow);
03072   }
03073 
03074   if (LHS.getValueType().isInteger()) {
03075     assert((LHS.getValueType() == RHS.getValueType()) &&
03076            (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
03077 
03078     // If the RHS of the comparison is zero, we can potentially fold this
03079     // to a specialized branch.
03080     const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
03081     if (RHSC && RHSC->getZExtValue() == 0) {
03082       if (CC == ISD::SETEQ) {
03083         // See if we can use a TBZ to fold in an AND as well.
03084         // TBZ has a smaller branch displacement than CBZ.  If the offset is
03085         // out of bounds, a late MI-layer pass rewrites branches.
03086         // 403.gcc is an example that hits this case.
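        // e.g. (illustrative) a branch taken when (and x, 8) == 0 can be
        // emitted as a single "tbz x, #3, dest" instead of an and+cbz pair.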
03087         if (LHS.getOpcode() == ISD::AND &&
03088             isa<ConstantSDNode>(LHS.getOperand(1)) &&
03089             isPowerOf2_64(LHS.getConstantOperandVal(1))) {
03090           SDValue Test = LHS.getOperand(0);
03091           uint64_t Mask = LHS.getConstantOperandVal(1);
03092           return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
03093                              DAG.getConstant(Log2_64(Mask), MVT::i64), Dest);
03094         }
03095 
03096         return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
03097       } else if (CC == ISD::SETNE) {
03098         // See if we can use a TBZ to fold in an AND as well.
03099         // TBZ has a smaller branch displacement than CBZ.  If the offset is
03100         // out of bounds, a late MI-layer pass rewrites branches.
03101         // 403.gcc is an example that hits this case.
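        // e.g. (illustrative) a branch taken when (and x, 8) != 0 becomes a
        // single "tbnz x, #3, dest".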
03102         if (LHS.getOpcode() == ISD::AND &&
03103             isa<ConstantSDNode>(LHS.getOperand(1)) &&
03104             isPowerOf2_64(LHS.getConstantOperandVal(1))) {
03105           SDValue Test = LHS.getOperand(0);
03106           uint64_t Mask = LHS.getConstantOperandVal(1);
03107           return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
03108                              DAG.getConstant(Log2_64(Mask), MVT::i64), Dest);
03109         }
03110 
03111         return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
03112       } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
03113         // Don't combine AND since emitComparison converts the AND to an ANDS
03114         // (a.k.a. TST) and the test in the test bit and branch instruction
03115         // becomes redundant.  This would also increase register pressure.
03116         uint64_t Mask = LHS.getValueType().getSizeInBits() - 1;
03117         return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
03118                            DAG.getConstant(Mask, MVT::i64), Dest);
03119       }
03120     }
03121     if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
03122         LHS.getOpcode() != ISD::AND) {
03123       // Don't combine AND since emitComparison converts the AND to an ANDS
03124       // (a.k.a. TST) and the test in the test bit and branch instruction
03125       // becomes redundant.  This would also increase register pressure.
03126       uint64_t Mask = LHS.getValueType().getSizeInBits() - 1;
03127       return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
03128                          DAG.getConstant(Mask, MVT::i64), Dest);
03129     }
03130 
03131     SDValue CCVal;
03132     SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
03133     return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
03134                        Cmp);
03135   }
03136 
03137   assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
03138 
03139   // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
03140   // clean.  Some of them require two branches to implement.
03141   SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
03142   AArch64CC::CondCode CC1, CC2;
03143   changeFPCCToAArch64CC(CC, CC1, CC2);
03144   SDValue CC1Val = DAG.getConstant(CC1, MVT::i32);
03145   SDValue BR1 =
03146       DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
03147   if (CC2 != AArch64CC::AL) {
03148     SDValue CC2Val = DAG.getConstant(CC2, MVT::i32);
03149     return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
03150                        Cmp);
03151   }
03152 
03153   return BR1;
03154 }
03155 
03156 SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
03157                                               SelectionDAG &DAG) const {
03158   EVT VT = Op.getValueType();
03159   SDLoc DL(Op);
03160 
03161   SDValue In1 = Op.getOperand(0);
03162   SDValue In2 = Op.getOperand(1);
03163   EVT SrcVT = In2.getValueType();
03164   if (SrcVT != VT) {
03165     if (SrcVT == MVT::f32 && VT == MVT::f64)
03166       In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2);
03167     else if (SrcVT == MVT::f64 && VT == MVT::f32)
03168       In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0));
03169     else
03170       // FIXME: Src type is different, bail out for now. Can VT really be a
03171       // vector type?
03172       return SDValue();
03173   }
03174 
03175   EVT VecVT;
03176   EVT EltVT;
03177   SDValue EltMask, VecVal1, VecVal2;
03178   if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) {
03179     EltVT = MVT::i32;
03180     VecVT = MVT::v4i32;
03181     EltMask = DAG.getConstant(0x80000000ULL, EltVT);
03182 
03183     if (!VT.isVector()) {
03184       VecVal1 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT,
03185                                           DAG.getUNDEF(VecVT), In1);
03186       VecVal2 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT,
03187                                           DAG.getUNDEF(VecVT), In2);
03188     } else {
03189       VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1);
03190       VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2);
03191     }
03192   } else if (VT == MVT::f64 || VT == MVT::v2f64) {
03193     EltVT = MVT::i64;
03194     VecVT = MVT::v2i64;
03195 
03196     // We want to materialize a mask with the high bit set, but the AdvSIMD
03197     // immediate moves cannot materialize that in a single instruction for
03198     // 64-bit elements. Instead, materialize zero and then negate it.
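    // (Illustrative note: FNEG of +0.0 produces -0.0, whose bit pattern is
    // 0x8000000000000000, i.e. exactly the 64-bit sign-bit mask we want.)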
03199     EltMask = DAG.getConstant(0, EltVT);
03200 
03201     if (!VT.isVector()) {
03202       VecVal1 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT,
03203                                           DAG.getUNDEF(VecVT), In1);
03204       VecVal2 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT,
03205                                           DAG.getUNDEF(VecVT), In2);
03206     } else {
03207       VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1);
03208       VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2);
03209     }
03210   } else {
03211     llvm_unreachable("Invalid type for copysign!");
03212   }
03213 
03214   std::vector<SDValue> BuildVectorOps;
03215   for (unsigned i = 0; i < VecVT.getVectorNumElements(); ++i)
03216     BuildVectorOps.push_back(EltMask);
03217 
03218   SDValue BuildVec = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, BuildVectorOps);
03219 
03220   // If we couldn't materialize the mask above, then the mask vector will be
03221   // the zero vector, and we need to negate it here.
03222   if (VT == MVT::f64 || VT == MVT::v2f64) {
03223     BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec);
03224     BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec);
03225     BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec);
03226   }
03227 
03228   SDValue Sel =
03229       DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec);
03230 
03231   if (VT == MVT::f32)
03232     return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel);
03233   else if (VT == MVT::f64)
03234     return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, Sel);
03235   else
03236     return DAG.getNode(ISD::BITCAST, DL, VT, Sel);
03237 }
03238 
03239 SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
03240   if (DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute(
03241           AttributeSet::FunctionIndex, Attribute::NoImplicitFloat))
03242     return SDValue();
03243 
03244   // While there is no integer popcount instruction, it can
03245   // be more efficiently lowered to the following sequence that uses
03246   // AdvSIMD registers/instructions as long as the copies to/from
03247   // the AdvSIMD registers are cheap.
03248   //  FMOV    D0, X0        // copy 64-bit int to vector, high bits zero'd
03249   //  CNT     V0.8B, V0.8B  // 8xbyte pop-counts
03250   //  ADDV    B0, V0.8B     // sum 8xbyte pop-counts
03251   //  UMOV    X0, V0.B[0]   // copy byte result back to integer reg
03252   SDValue Val = Op.getOperand(0);
03253   SDLoc DL(Op);
03254   EVT VT = Op.getValueType();
03255   SDValue ZeroVec = DAG.getUNDEF(MVT::v8i8);
03256 
03257   SDValue VecVal;
03258   if (VT == MVT::i32) {
03259     VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Val);
03260     VecVal = DAG.getTargetInsertSubreg(AArch64::ssub, DL, MVT::v8i8, ZeroVec,
03261                                        VecVal);
03262   } else {
03263     VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
03264   }
03265 
03266   SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, VecVal);
03267   SDValue UaddLV = DAG.getNode(
03268       ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
03269       DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, MVT::i32), CtPop);
03270 
03271   if (VT == MVT::i64)
03272     UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
03273   return UaddLV;
03274 }
03275 
03276 SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
03277 
03278   if (Op.getValueType().isVector())
03279     return LowerVSETCC(Op, DAG);
03280 
03281   SDValue LHS = Op.getOperand(0);
03282   SDValue RHS = Op.getOperand(1);
03283   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
03284   SDLoc dl(Op);
03285 
03286   // We chose ZeroOrOneBooleanContents, so use zero and one.
03287   EVT VT = Op.getValueType();
03288   SDValue TVal = DAG.getConstant(1, VT);
03289   SDValue FVal = DAG.getConstant(0, VT);
03290 
03291   // Handle f128 first, since one possible outcome is a normal integer
03292   // comparison which gets picked up by the next if statement.
03293   if (LHS.getValueType() == MVT::f128) {
03294     softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
03295 
03296     // If softenSetCCOperands returned a scalar, use it.
03297     if (!RHS.getNode()) {
03298       assert(LHS.getValueType() == Op.getValueType() &&
03299              "Unexpected setcc expansion!");
03300       return LHS;
03301     }
03302   }
03303 
03304   if (LHS.getValueType().isInteger()) {
03305     SDValue CCVal;
03306     SDValue Cmp =
03307         getAArch64Cmp(LHS, RHS, ISD::getSetCCInverse(CC, true), CCVal, DAG, dl);
03308 
03309     // Note that we inverted the condition above, so we reverse the order of
03310     // the true and false operands here.  This will allow the setcc to be
03311     // matched to a single CSINC instruction.
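    // e.g. (illustrative) an i32 (seteq x, y) then becomes:
    //     cmp  x, y
    //     cset w0, eq        ; alias of csinc w0, wzr, wzr, ne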
03312     return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
03313   }
03314 
03315   // Now we know we're dealing with FP values.
03316   assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
03317 
03318   // If that fails, we'll need to perform an FCMP + CSEL sequence.  Go ahead
03319   // and do the comparison.
03320   SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
03321 
03322   AArch64CC::CondCode CC1, CC2;
03323   changeFPCCToAArch64CC(CC, CC1, CC2);
03324   if (CC2 == AArch64CC::AL) {
03325     changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, false), CC1, CC2);
03326     SDValue CC1Val = DAG.getConstant(CC1, MVT::i32);
03327 
03328     // Note that we inverted the condition above, so we reverse the order of
03329     // the true and false operands here.  This will allow the setcc to be
03330     // matched to a single CSINC instruction.
03331     return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
03332   } else {
03333     // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
03334     // totally clean.  Some of them require two CSELs to implement.  As is in
03335     // this case, we emit the first CSEL and then emit a second using the output
03336     // of the first as the RHS.  We're effectively OR'ing the two CC's together.
03337 
03338     // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
03339     SDValue CC1Val = DAG.getConstant(CC1, MVT::i32);
03340     SDValue CS1 =
03341         DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
03342 
03343     SDValue CC2Val = DAG.getConstant(CC2, MVT::i32);
03344     return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
03345   }
03346 }
03347 
03348 /// A SELECT_CC operation is really some kind of max or min if both values being
03349 /// compared are, in some sense, equal to the results in either case. However,
03350 /// it is permissible to compare f32 values and produce directly extended f64
03351 /// values.
03352 ///
03353 /// Extending the comparison operands would also be allowed, but is less likely
03354 /// to happen in practice since their use is right here. Note that truncate
03355 /// operations would *not* be semantically equivalent.
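/// For example (illustrative): in (select_cc a, b, (fp_extend a),
/// (fp_extend b), setgt) the extended results still pair up with the f32
/// comparison operands, so the whole node can be treated as an f64 fmax of
/// the extended values.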
03356 static bool selectCCOpsAreFMaxCompatible(SDValue Cmp, SDValue Result) {
03357   if (Cmp == Result)
03358     return true;
03359 
03360   ConstantFPSDNode *CCmp = dyn_cast<ConstantFPSDNode>(Cmp);
03361   ConstantFPSDNode *CResult = dyn_cast<ConstantFPSDNode>(Result);
03362   if (CCmp && CResult && Cmp.getValueType() == MVT::f32 &&
03363       Result.getValueType() == MVT::f64) {
03364     bool Lossy;
03365     APFloat CmpVal = CCmp->getValueAPF();
03366     CmpVal.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &Lossy);
03367     return CResult->getValueAPF().bitwiseIsEqual(CmpVal);
03368   }
03369 
03370   return Result->getOpcode() == ISD::FP_EXTEND && Result->getOperand(0) == Cmp;
03371 }
03372 
03373 SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
03374                                            SelectionDAG &DAG) const {
03375   SDValue CC = Op->getOperand(0);
03376   SDValue TVal = Op->getOperand(1);
03377   SDValue FVal = Op->getOperand(2);
03378   SDLoc DL(Op);
03379 
03380   unsigned Opc = CC.getOpcode();
03381   // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
03382   // instruction.
03383   if (CC.getResNo() == 1 &&
03384       (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
03385        Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) {
03386     // Only lower legal XALUO ops.
03387     if (!DAG.getTargetLoweringInfo().isTypeLegal(CC->getValueType(0)))
03388       return SDValue();
03389 
03390     AArch64CC::CondCode OFCC;
03391     SDValue Value, Overflow;
03392     std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CC.getValue(0), DAG);
03393     SDValue CCVal = DAG.getConstant(OFCC, MVT::i32);
03394 
03395     return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
03396                        CCVal, Overflow);
03397   }
03398 
03399   if (CC.getOpcode() == ISD::SETCC)
03400     return DAG.getSelectCC(DL, CC.getOperand(0), CC.getOperand(1), TVal, FVal,
03401                            cast<CondCodeSDNode>(CC.getOperand(2))->get());
03402   else
03403     return DAG.getSelectCC(DL, CC, DAG.getConstant(0, CC.getValueType()), TVal,
03404                            FVal, ISD::SETNE);
03405 }
03406 
03407 SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
03408                                               SelectionDAG &DAG) const {
03409   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
03410   SDValue LHS = Op.getOperand(0);
03411   SDValue RHS = Op.getOperand(1);
03412   SDValue TVal = Op.getOperand(2);
03413   SDValue FVal = Op.getOperand(3);
03414   SDLoc dl(Op);
03415 
03416   // Handle f128 first, because it will result in a comparison of some RTLIB
03417   // call result against zero.
03418   if (LHS.getValueType() == MVT::f128) {
03419     softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
03420 
03421     // If softenSetCCOperands returned a scalar, we need to compare the result
03422     // against zero to select between true and false values.
03423     if (!RHS.getNode()) {
03424       RHS = DAG.getConstant(0, LHS.getValueType());
03425       CC = ISD::SETNE;
03426     }
03427   }
03428 
03429   // Handle integers first.
03430   if (LHS.getValueType().isInteger()) {
03431     assert((LHS.getValueType() == RHS.getValueType()) &&
03432            (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
03433 
03434     unsigned Opcode = AArch64ISD::CSEL;
03435 
03436     // If both the TVal and the FVal are constants, see if we can swap them in
03437     // order to form a CSINV or CSINC out of them.
03438     ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
03439     ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
03440 
03441     if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) {
03442       std::swap(TVal, FVal);
03443       std::swap(CTVal, CFVal);
03444       CC = ISD::getSetCCInverse(CC, true);
03445     } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) {
03446       std::swap(TVal, FVal);
03447       std::swap(CTVal, CFVal);
03448       CC = ISD::getSetCCInverse(CC, true);
03449     } else if (TVal.getOpcode() == ISD::XOR) {
03450       // If TVal is a NOT we want to swap TVal and FVal so that we can match
03451       // with a CSINV rather than a CSEL.
03452       ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(TVal.getOperand(1));
03453 
03454       if (CVal && CVal->isAllOnesValue()) {
03455         std::swap(TVal, FVal);
03456         std::swap(CTVal, CFVal);
03457         CC = ISD::getSetCCInverse(CC, true);
03458       }
03459     } else if (TVal.getOpcode() == ISD::SUB) {
03460       // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
03461       // that we can match with a CSNEG rather than a CSEL.
03462       ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(TVal.getOperand(0));
03463 
03464       if (CVal && CVal->isNullValue()) {
03465         std::swap(TVal, FVal);
03466         std::swap(CTVal, CFVal);
03467         CC = ISD::getSetCCInverse(CC, true);
03468       }
03469     } else if (CTVal && CFVal) {
03470       const int64_t TrueVal = CTVal->getSExtValue();
03471       const int64_t FalseVal = CFVal->getSExtValue();
03472       bool Swap = false;
03473 
03474       // If both TVal and FVal are constants, see if FVal is the
03475       // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
03476       // instead of a CSEL in that case.
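      // e.g. (illustrative constants):
      //   TVal == 7, FVal == -8  -> CSINV, since 7 == ~(-8)
      //   TVal == 5, FVal == -5  -> CSNEG, since 5 == -(-5)
      //   TVal == 4, FVal == 3   -> CSINC, since 4 == 3 + 1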
03477       if (TrueVal == ~FalseVal) {
03478         Opcode = AArch64ISD::CSINV;
03479       } else if (TrueVal == -FalseVal) {
03480         Opcode = AArch64ISD::CSNEG;
03481       } else if (TVal.getValueType() == MVT::i32) {
03482         // If our operands are only 32-bit wide, make sure we use 32-bit
03483         // arithmetic for the check whether we can use CSINC. This ensures that
03484         // the addition in the check will wrap around properly in case there is
03485         // an overflow (which would not be the case if we do the check with
03486         // 64-bit arithmetic).
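        // e.g. (illustrative) TVal == 0x7FFFFFFF and FVal == 0x80000000: in
        // 32-bit arithmetic TrueVal32 + 1 wraps to FalseVal32, so a CSINC is
        // still usable, which the sign-extended 64-bit check would reject.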
03487         const uint32_t TrueVal32 = CTVal->getZExtValue();
03488         const uint32_t FalseVal32 = CFVal->getZExtValue();
03489 
03490         if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
03491           Opcode = AArch64ISD::CSINC;
03492 
03493           if (TrueVal32 > FalseVal32) {
03494             Swap = true;
03495           }
03496         }
03497         // 64-bit check whether we can use CSINC.
03498       } else if ((TrueVal == FalseVal + 1) || (TrueVal + 1 == FalseVal)) {
03499         Opcode = AArch64ISD::CSINC;
03500 
03501         if (TrueVal > FalseVal) {
03502           Swap = true;
03503         }
03504       }
03505 
03506       // Swap TVal and FVal if necessary.
03507       if (Swap) {
03508         std::swap(TVal, FVal);
03509         std::swap(CTVal, CFVal);
03510         CC = ISD::getSetCCInverse(CC, true);
03511       }
03512 
03513       if (Opcode != AArch64ISD::CSEL) {
03514         // Drop FVal since we can get its value by simply inverting/negating
03515         // TVal.
03516         FVal = TVal;
03517       }
03518     }
03519 
03520     SDValue CCVal;
03521     SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
03522 
03523     EVT VT = Op.getValueType();
03524     return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
03525   }
03526 
03527   // Now we know we're dealing with FP values.
03528   assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
03529   assert(LHS.getValueType() == RHS.getValueType());
03530   EVT VT = Op.getValueType();
03531 
03532   // Try to match this select into a max/min operation, which have dedicated
03533   // opcodes in the instruction set.
03534   // FIXME: This is not correct in the presence of NaNs, so we only enable this
03535   // in no-NaNs mode.
03536   if (getTargetMachine().Options.NoNaNsFPMath) {
03537     SDValue MinMaxLHS = TVal, MinMaxRHS = FVal;
03538     if (selectCCOpsAreFMaxCompatible(LHS, MinMaxRHS) &&
03539         selectCCOpsAreFMaxCompatible(RHS, MinMaxLHS)) {
03540       CC = ISD::getSetCCSwappedOperands(CC);
03541       std::swap(MinMaxLHS, MinMaxRHS);
03542     }
03543 
03544     if (selectCCOpsAreFMaxCompatible(LHS, MinMaxLHS) &&
03545         selectCCOpsAreFMaxCompatible(RHS, MinMaxRHS)) {
03546       switch (CC) {
03547       default:
03548         break;
03549       case ISD::SETGT:
03550       case ISD::SETGE:
03551       case ISD::SETUGT:
03552       case ISD::SETUGE:
03553       case ISD::SETOGT:
03554       case ISD::SETOGE:
03555         return DAG.getNode(AArch64ISD::FMAX, dl, VT, MinMaxLHS, MinMaxRHS);
03556         break;
03557       case ISD::SETLT:
03558       case ISD::SETLE:
03559       case ISD::SETULT:
03560       case ISD::SETULE:
03561       case ISD::SETOLT:
03562       case ISD::SETOLE:
03563         return DAG.getNode(AArch64ISD::FMIN, dl, VT, MinMaxLHS, MinMaxRHS);
03564         break;
03565       }
03566     }
03567   }
03568 
03569   // If that fails, we'll need to perform an FCMP + CSEL sequence.  Go ahead
03570   // and do the comparison.
03571   SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
03572 
03573   // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
03574   // clean.  Some of them require two CSELs to implement.
03575   AArch64CC::CondCode CC1, CC2;
03576   changeFPCCToAArch64CC(CC, CC1, CC2);
03577   SDValue CC1Val = DAG.getConstant(CC1, MVT::i32);
03578   SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
03579 
03580   // If we need a second CSEL, emit it, using the output of the first as the
03581   // RHS.  We're effectively OR'ing the two CC's together.
03582   if (CC2 != AArch64CC::AL) {
03583     SDValue CC2Val = DAG.getConstant(CC2, MVT::i32);
03584     return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
03585   }
03586 
03587   // Otherwise, return the output of the first CSEL.
03588   return CS1;
03589 }
03590 
03591 SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
03592                                               SelectionDAG &DAG) const {
03593   // Jump table entries are PC-relative offsets. No additional tweaking
03594   // is necessary here. Just get the address of the jump table.
03595   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
03596   EVT PtrVT = getPointerTy();
03597   SDLoc DL(Op);
03598 
03599   if (getTargetMachine().getCodeModel() == CodeModel::Large &&
03600       !Subtarget->isTargetMachO()) {
03601     const unsigned char MO_NC = AArch64II::MO_NC;
03602     return DAG.getNode(
03603         AArch64ISD::WrapperLarge, DL, PtrVT,
03604         DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G3),
03605         DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G2 | MO_NC),
03606         DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G1 | MO_NC),
03607         DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
03608                                AArch64II::MO_G0 | MO_NC));
03609   }
03610 
03611   SDValue Hi =
03612       DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_PAGE);
03613   SDValue Lo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
03614                                       AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
03615   SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
03616   return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
03617 }
03618 
03619 SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
03620                                                  SelectionDAG &DAG) const {
03621   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
03622   EVT PtrVT = getPointerTy();
03623   SDLoc DL(Op);
03624 
03625   if (getTargetMachine().getCodeModel() == CodeModel::Large) {
03626     // Use the GOT for the large code model on iOS.
03627     if (Subtarget->isTargetMachO()) {
03628       SDValue GotAddr = DAG.getTargetConstantPool(
03629           CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(),
03630           AArch64II::MO_GOT);
03631       return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr);
03632     }
03633 
03634     const unsigned char MO_NC = AArch64II::MO_NC;
03635     return DAG.getNode(
03636         AArch64ISD::WrapperLarge, DL, PtrVT,
03637         DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
03638                                   CP->getOffset(), AArch64II::MO_G3),
03639         DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
03640                                   CP->getOffset(), AArch64II::MO_G2 | MO_NC),
03641         DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
03642                                   CP->getOffset(), AArch64II::MO_G1 | MO_NC),
03643         DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
03644                                   CP->getOffset(), AArch64II::MO_G0 | MO_NC));
03645   } else {
03646     // Use ADRP/ADD or ADRP/LDR for everything else: the small code model on
03647     // ELF, which is also the only valid model on Darwin.
03648     SDValue Hi =
03649         DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
03650                                   CP->getOffset(), AArch64II::MO_PAGE);
03651     SDValue Lo = DAG.getTargetConstantPool(
03652         CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(),
03653         AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
03654 
03655     SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
03656     return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
03657   }
03658 }
03659 
03660 SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
03661                                                SelectionDAG &DAG) const {
03662   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
03663   EVT PtrVT = getPointerTy();
03664   SDLoc DL(Op);
03665   if (getTargetMachine().getCodeModel() == CodeModel::Large &&
03666       !Subtarget->isTargetMachO()) {
03667     const unsigned char MO_NC = AArch64II::MO_NC;
03668     return DAG.getNode(
03669         AArch64ISD::WrapperLarge, DL, PtrVT,
03670         DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G3),
03671         DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G2 | MO_NC),
03672         DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G1 | MO_NC),
03673         DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G0 | MO_NC));
03674   } else {
03675     SDValue Hi = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGE);
03676     SDValue Lo = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGEOFF |
03677                                                              AArch64II::MO_NC);
03678     SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
03679     return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
03680   }
03681 }
03682 
03683 SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
03684                                                  SelectionDAG &DAG) const {
03685   AArch64FunctionInfo *FuncInfo =
03686       DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
03687 
03688   SDLoc DL(Op);
03689   SDValue FR =
03690       DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy());
03691   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
03692   return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
03693                       MachinePointerInfo(SV), false, false, 0);
03694 }
03695 
03696 SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
03697                                                 SelectionDAG &DAG) const {
03698   // The layout of the va_list struct is specified in the AArch64 Procedure Call
03699   // Standard, section B.3.
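  // For reference, that layout is (offsets as used below):
  //   struct va_list {
  //     void *__stack;   // offset 0
  //     void *__gr_top;  // offset 8
  //     void *__vr_top;  // offset 16
  //     int   __gr_offs; // offset 24
  //     int   __vr_offs; // offset 28
  //   };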
03700   MachineFunction &MF = DAG.getMachineFunction();
03701   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
03702   SDLoc DL(Op);
03703 
03704   SDValue Chain = Op.getOperand(0);
03705   SDValue VAList = Op.getOperand(1);
03706   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
03707   SmallVector<SDValue, 4> MemOps;
03708 
03709   // void *__stack at offset 0
03710   SDValue Stack =
03711       DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy());
03712   MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
03713                                 MachinePointerInfo(SV), false, false, 8));
03714 
03715   // void *__gr_top at offset 8
03716   int GPRSize = FuncInfo->getVarArgsGPRSize();
03717   if (GPRSize > 0) {
03718     SDValue GRTop, GRTopAddr;
03719 
03720     GRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
03721                             DAG.getConstant(8, getPointerTy()));
03722 
03723     GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), getPointerTy());
03724     GRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), GRTop,
03725                         DAG.getConstant(GPRSize, getPointerTy()));
03726 
03727     MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
03728                                   MachinePointerInfo(SV, 8), false, false, 8));
03729   }
03730 
03731   // void *__vr_top at offset 16
03732   int FPRSize = FuncInfo->getVarArgsFPRSize();
03733   if (FPRSize > 0) {
03734     SDValue VRTop, VRTopAddr;
03735     VRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
03736                             DAG.getConstant(16, getPointerTy()));
03737 
03738     VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), getPointerTy());
03739     VRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), VRTop,
03740                         DAG.getConstant(FPRSize, getPointerTy()));
03741 
03742     MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
03743                                   MachinePointerInfo(SV, 16), false, false, 8));
03744   }
03745 
03746   // int __gr_offs at offset 24
03747   SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
03748                                    DAG.getConstant(24, getPointerTy()));
03749   MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, MVT::i32),
03750                                 GROffsAddr, MachinePointerInfo(SV, 24), false,
03751                                 false, 4));
03752 
03753   // int __vr_offs at offset 28
03754   SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
03755                                    DAG.getConstant(28, getPointerTy()));
03756   MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, MVT::i32),
03757                                 VROffsAddr, MachinePointerInfo(SV, 28), false,
03758                                 false, 4));
03759 
03760   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
03761 }
03762 
03763 SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
03764                                             SelectionDAG &DAG) const {
03765   return Subtarget->isTargetDarwin() ? LowerDarwin_VASTART(Op, DAG)
03766                                      : LowerAAPCS_VASTART(Op, DAG);
03767 }
03768 
03769 SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
03770                                            SelectionDAG &DAG) const {
03771   // AAPCS has three pointers and two ints (= 32 bytes), Darwin has a single
03772   // pointer.
03773   unsigned VaListSize = Subtarget->isTargetDarwin() ? 8 : 32;
03774   const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
03775   const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
03776 
03777   return DAG.getMemcpy(Op.getOperand(0), SDLoc(Op), Op.getOperand(1),
03778                        Op.getOperand(2), DAG.getConstant(VaListSize, MVT::i32),
03779                        8, false, false, MachinePointerInfo(DestSV),
03780                        MachinePointerInfo(SrcSV));
03781 }
03782 
03783 SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
03784   assert(Subtarget->isTargetDarwin() &&
03785          "automatic va_arg instruction only works on Darwin");
03786 
03787   const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
03788   EVT VT = Op.getValueType();
03789   SDLoc DL(Op);
03790   SDValue Chain = Op.getOperand(0);
03791   SDValue Addr = Op.getOperand(1);
03792   unsigned Align = Op.getConstantOperandVal(3);
03793 
03794   SDValue VAList = DAG.getLoad(getPointerTy(), DL, Chain, Addr,
03795                                MachinePointerInfo(V), false, false, false, 0);
03796   Chain = VAList.getValue(1);
03797 
03798   if (Align > 8) {
03799     assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2");
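    // Round the va_list pointer up to the next multiple of Align, i.e.
    // VAList = (VAList + Align - 1) & ~(Align - 1).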
03800     VAList = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
03801                          DAG.getConstant(Align - 1, getPointerTy()));
03802     VAList = DAG.getNode(ISD::AND, DL, getPointerTy(), VAList,
03803                          DAG.getConstant(-(int64_t)Align, getPointerTy()));
03804   }
03805 
03806   Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
03807   uint64_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy);
03808 
03809   // Scalar integer and FP values smaller than 64 bits are implicitly extended
03810   // up to 64 bits.  At the very least, we have to increase the striding of the
03811   // vaargs list to match this, and for FP values we need to introduce
03812   // FP_ROUND nodes as well.
03813   if (VT.isInteger() && !VT.isVector())
03814     ArgSize = 8;
03815   bool NeedFPTrunc = false;
03816   if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
03817     ArgSize = 8;
03818     NeedFPTrunc = true;
03819   }
03820 
03821   // Increment the pointer, VAList, to the next vaarg
03822   SDValue VANext = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
03823                                DAG.getConstant(ArgSize, getPointerTy()));
03824   // Store the incremented VAList to the legalized pointer
03825   SDValue APStore = DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V),
03826                                  false, false, 0);
03827 
03828   // Load the actual argument out of the pointer VAList
03829   if (NeedFPTrunc) {
03830     // Load the value as an f64.
03831     SDValue WideFP = DAG.getLoad(MVT::f64, DL, APStore, VAList,
03832                                  MachinePointerInfo(), false, false, false, 0);
03833     // Round the value down to an f32.
03834     SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
03835                                    DAG.getIntPtrConstant(1));
03836     SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
03837     // Merge the rounded value with the chain output of the load.
03838     return DAG.getMergeValues(Ops, DL);
03839   }
03840 
03841   return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo(), false,
03842                      false, false, 0);
03843 }
03844 
03845 SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
03846                                               SelectionDAG &DAG) const {
03847   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
03848   MFI->setFrameAddressIsTaken(true);
03849 
03850   EVT VT = Op.getValueType();
03851   SDLoc DL(Op);
03852   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
03853   SDValue FrameAddr =
03854       DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
03855   while (Depth--)
03856     FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
03857                             MachinePointerInfo(), false, false, false, 0);
03858   return FrameAddr;
03859 }
03860 
03861 // FIXME? Maybe this could be a TableGen attribute on some registers and
03862 // this table could be generated automatically from RegInfo.
03863 unsigned AArch64TargetLowering::getRegisterByName(const char* RegName,
03864                                                   EVT VT) const {
03865   unsigned Reg = StringSwitch<unsigned>(RegName)
03866                        .Case("sp", AArch64::SP)
03867                        .Default(0);
03868   if (Reg)
03869     return Reg;
03870   report_fatal_error("Invalid register name for global variable");
03871 }
03872 
03873 SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
03874                                                SelectionDAG &DAG) const {
03875   MachineFunction &MF = DAG.getMachineFunction();
03876   MachineFrameInfo *MFI = MF.getFrameInfo();
03877   MFI->setReturnAddressIsTaken(true);
03878 
03879   EVT VT = Op.getValueType();
03880   SDLoc DL(Op);
03881   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
03882   if (Depth) {
03883     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
03884     SDValue Offset = DAG.getConstant(8, getPointerTy());
03885     return DAG.getLoad(VT, DL, DAG.getEntryNode(),
03886                        DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset),
03887                        MachinePointerInfo(), false, false, false, 0);
03888   }
03889 
03890   // Return LR, which contains the return address. Mark it an implicit live-in.
03891   unsigned Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
03892   return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
03893 }
03894 
03895 /// LowerShiftRightParts - Lower SRA_PARTS/SRL_PARTS, which return two
03896 /// i64 values and take a 2 x i64 value to shift plus a shift amount.
03897 SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op,
03898                                                     SelectionDAG &DAG) const {
03899   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
03900   EVT VT = Op.getValueType();
03901   unsigned VTBits = VT.getSizeInBits();
03902   SDLoc dl(Op);
03903   SDValue ShOpLo = Op.getOperand(0);
03904   SDValue ShOpHi = Op.getOperand(1);
03905   SDValue ShAmt = Op.getOperand(2);
03906   SDValue ARMcc;
03907   unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
03908 
03909   assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
03910 
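  // Conceptually, for a 128-bit shift right by ShAmt (with VTBits == 64):
  //   if (ShAmt - VTBits >= 0) {     // the shift reaches into the high word
  //     Lo = Hi >> (ShAmt - VTBits); // arithmetic or logical, per Opc
  //     Hi = SRA ? Hi >> (VTBits - 1) : 0;
  //   } else {
  //     Lo = (Lo >> ShAmt) | (Hi << (VTBits - ShAmt));
  //     Hi = Hi >> ShAmt;            // arithmetic or logical, per Opc
  //   }
  // The CSELs below select between the two arms on ShAmt - VTBits >= 0.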
03911   SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
03912                                  DAG.getConstant(VTBits, MVT::i64), ShAmt);
03913   SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
03914   SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
03915                                    DAG.getConstant(VTBits, MVT::i64));
03916   SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
03917 
03918   SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, MVT::i64),
03919                                ISD::SETGE, dl, DAG);
03920   SDValue CCVal = DAG.getConstant(AArch64CC::GE, MVT::i32);
03921 
03922   SDValue FalseValLo = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
03923   SDValue TrueValLo = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
03924   SDValue Lo =
03925       DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp);
03926 
03927   // AArch64 shifts larger than the register width are wrapped rather than
03928   // clamped, so we can't just emit "hi >> x".
03929   SDValue FalseValHi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
03930   SDValue TrueValHi = Opc == ISD::SRA
03931                           ? DAG.getNode(Opc, dl, VT, ShOpHi,
03932                                         DAG.getConstant(VTBits - 1, MVT::i64))
03933                           : DAG.getConstant(0, VT);
03934   SDValue Hi =
03935       DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValHi, FalseValHi, CCVal, Cmp);
03936 
03937   SDValue Ops[2] = { Lo, Hi };
03938   return DAG.getMergeValues(Ops, dl);
03939 }
03940 
03941 /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
03942 /// i64 values and takes a 2 x i64 value to shift plus a shift amount.
03943 SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op,
03944                                                  SelectionDAG &DAG) const {
03945   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
03946   EVT VT = Op.getValueType();
03947   unsigned VTBits = VT.getSizeInBits();
03948   SDLoc dl(Op);
03949   SDValue ShOpLo = Op.getOperand(0);
03950   SDValue ShOpHi = Op.getOperand(1);
03951   SDValue ShAmt = Op.getOperand(2);
03952   SDValue ARMcc;
03953 
03954   assert(Op.getOpcode() == ISD::SHL_PARTS);
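  // Conceptually, for a 128-bit shift left by ShAmt (with VTBits == 64):
  //   if (ShAmt - VTBits >= 0) {     // the shift reaches into the high word
  //     Hi = Lo << (ShAmt - VTBits);
  //     Lo = 0;
  //   } else {
  //     Hi = (Hi << ShAmt) | (Lo >> (VTBits - ShAmt));
  //     Lo = Lo << ShAmt;
  //   }
  // The CSELs below select between the two arms on ShAmt - VTBits >= 0.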
03955   SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
03956                                  DAG.getConstant(VTBits, MVT::i64), ShAmt);
03957   SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
03958   SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
03959                                    DAG.getConstant(VTBits, MVT::i64));
03960   SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
03961   SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
03962 
03963   SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
03964 
03965   SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, MVT::i64),
03966                                ISD::SETGE, dl, DAG);
03967   SDValue CCVal = DAG.getConstant(AArch64CC::GE, MVT::i32);
03968   SDValue Hi =
03969       DAG.getNode(AArch64ISD::CSEL, dl, VT, Tmp3, FalseVal, CCVal, Cmp);
03970 
03971   // AArch64 shifts larger than the register width are wrapped rather than
03972   // clamped, so we can't just emit "lo << a" if a is too big.
03973   SDValue TrueValLo = DAG.getConstant(0, VT);
03974   SDValue FalseValLo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
03975   SDValue Lo =
03976       DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp);
03977 
03978   SDValue Ops[2] = { Lo, Hi };
03979   return DAG.getMergeValues(Ops, dl);
03980 }
03981 
03982 bool AArch64TargetLowering::isOffsetFoldingLegal(
03983     const GlobalAddressSDNode *GA) const {
03984   // The AArch64 target doesn't support folding offsets into global addresses.
03985   return false;
03986 }
03987 
03988 bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
03989   // We can materialize #0.0 as fmov $Rd, XZR for 64-bit and 32-bit cases.
03990   // FIXME: We should be able to handle f128 as well with a clever lowering.
03991   if (Imm.isPosZero() && (VT == MVT::f64 || VT == MVT::f32))
03992     return true;
03993 
03994   if (VT == MVT::f64)
03995     return AArch64_AM::getFP64Imm(Imm) != -1;
03996   else if (VT == MVT::f32)
03997     return AArch64_AM::getFP32Imm(Imm) != -1;
03998   return false;
03999 }
04000 
04001 //===----------------------------------------------------------------------===//
04002 //                          AArch64 Optimization Hooks
04003 //===----------------------------------------------------------------------===//
04004 
04005 //===----------------------------------------------------------------------===//
04006 //                          AArch64 Inline Assembly Support
04007 //===----------------------------------------------------------------------===//
04008 
04009 // Table of Constraints
04010 // TODO: This is the current set of constraints supported by ARM for the
04011 // compiler; not all of them may make sense, e.g. S may be difficult to support.
04012 //
04013 // r - A general register
04014 // w - An FP/SIMD register of some size in the range v0-v31
04015 // x - An FP/SIMD register of some size in the range v0-v15
04016 // I - Constant that can be used with an ADD instruction
04017 // J - Constant that can be used with a SUB instruction
04018 // K - Constant that can be used with a 32-bit logical instruction
04019 // L - Constant that can be used with a 64-bit logical instruction
04020 // M - Constant that can be used as a 32-bit MOV immediate
04021 // N - Constant that can be used as a 64-bit MOV immediate
04022 // Q - A memory reference with base register and no offset
04023 // S - A symbolic address
04024 // Y - Floating point constant zero
04025 // Z - Integer constant zero
04026 //
04027 //   Note that general register operands will be output using their 64-bit x
04028 // register name, whatever the size of the variable, unless the asm operand
04029 // is prefixed by the %w modifier. Floating-point and SIMD register operands
04030 // will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
04031 // %q modifier.
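//
// A hypothetical user-level example (not taken from this file): the 'r'
// constraint combined with the %w modifier forces 32-bit register names:
//   int32_t res;
//   asm("add %w0, %w1, %w2" : "=r"(res) : "r"(a), "r"(b));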
04032 
04033 /// getConstraintType - Given a constraint letter, return the type of
04034 /// constraint it is for this target.
04035 AArch64TargetLowering::ConstraintType
04036 AArch64TargetLowering::getConstraintType(const std::string &Constraint) const {
04037   if (Constraint.size() == 1) {
04038     switch (Constraint[0]) {
04039     default:
04040       break;
04041     case 'z':
04042       return C_Other;
04043     case 'x':
04044     case 'w':
04045       return C_RegisterClass;
04046     // An address with a single base register. Due to the way we
04047     // currently handle addresses it is the same as 'r'.
04048     case 'Q':
04049       return C_Memory;
04050     }
04051   }
04052   return TargetLowering::getConstraintType(Constraint);
04053 }
04054 
04055 /// Examine constraint type and operand type and determine a weight value.
04056 /// This object must already have been set up with the operand type
04057 /// and the current alternative constraint selected.
04058 TargetLowering::ConstraintWeight
04059 AArch64TargetLowering::getSingleConstraintMatchWeight(
04060     AsmOperandInfo &info, const char *constraint) const {
04061   ConstraintWeight weight = CW_Invalid;
04062   Value *CallOperandVal = info.CallOperandVal;
04063   // If we don't have a value, we can't do a match,
04064   // but allow it at the lowest weight.
04065   if (!CallOperandVal)
04066     return CW_Default;
04067   Type *type = CallOperandVal->getType();
04068   // Look at the constraint type.
04069   switch (*constraint) {
04070   default:
04071     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
04072     break;
04073   case 'x':
04074   case 'w':
04075     if (type->isFloatingPointTy() || type->isVectorTy())
04076       weight = CW_Register;
04077     break;
04078   case 'z':
04079     weight = CW_Constant;
04080     break;
04081   }
04082   return weight;
04083 }
04084 
04085 std::pair<unsigned, const TargetRegisterClass *>
04086 AArch64TargetLowering::getRegForInlineAsmConstraint(
04087     const std::string &Constraint, MVT VT) const {
04088   if (Constraint.size() == 1) {
04089     switch (Constraint[0]) {
04090     case 'r':
04091       if (VT.getSizeInBits() == 64)
04092         return std::make_pair(0U, &AArch64::GPR64commonRegClass);
04093       return std::make_pair(0U, &AArch64::GPR32commonRegClass);
04094     case 'w':
04095       if (VT == MVT::f32)
04096         return std::make_pair(0U, &AArch64::FPR32RegClass);
04097       if (VT.getSizeInBits() == 64)
04098         return std::make_pair(0U, &AArch64::FPR64RegClass);
04099       if (VT.getSizeInBits() == 128)
04100         return std::make_pair(0U, &AArch64::FPR128RegClass);
04101       break;
04102     // The instructions that this constraint is designed for can
04103     // only take 128-bit registers so just use that regclass.
04104     case 'x':
04105       if (VT.getSizeInBits() == 128)
04106         return std::make_pair(0U, &AArch64::FPR128_loRegClass);
04107       break;
04108     }
04109   }
04110   if (StringRef("{cc}").equals_lower(Constraint))
04111     return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
04112 
04113   // Use the default implementation in TargetLowering to convert the register
04114   // constraint into a member of a register class.
04115   std::pair<unsigned, const TargetRegisterClass *> Res;
04116   Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
04117 
04118   // Not found as a standard register?
04119   if (!Res.second) {
04120     unsigned Size = Constraint.size();
04121     if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
04122         tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
04123       const std::string Reg =
04124           std::string(&Constraint[2], &Constraint[Size - 1]);
04125       int RegNo = atoi(Reg.c_str());
04126       if (RegNo >= 0 && RegNo <= 31) {
04127         // v0 - v31 are aliases of q0 - q31.
04128         // By default we'll emit v0-v31 for this; if a modifier is given, the
04129         // correctly-sized register is emitted instead.
04130         Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
04131         Res.second = &AArch64::FPR128RegClass;
04132       }
04133     }
04134   }
04135 
04136   return Res;
04137 }
04138 
04139 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
04140 /// vector.  If it is invalid, don't add anything to Ops.
04141 void AArch64TargetLowering::LowerAsmOperandForConstraint(
04142     SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
04143     SelectionDAG &DAG) const {
04144   SDValue Result;
04145 
04146   // Currently only support length 1 constraints.
04147   if (Constraint.length() != 1)
04148     return;
04149 
04150   char ConstraintLetter = Constraint[0];
04151   switch (ConstraintLetter) {
04152   default:
04153     break;
04154 
04155   // This set of constraints deals with valid constants for various instructions.
04156   // Validate and return a target constant for them if we can.
04157   case 'z': {
04158     // 'z' maps to xzr or wzr so it needs an input of 0.
04159     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
04160     if (!C || C->getZExtValue() != 0)
04161       return;
04162 
04163     if (Op.getValueType() == MVT::i64)
04164       Result = DAG.getRegister(AArch64::XZR, MVT::i64);
04165     else
04166       Result = DAG.getRegister(AArch64::WZR, MVT::i32);
04167     break;
04168   }
04169 
04170   case 'I':
04171   case 'J':
04172   case 'K':
04173   case 'L':
04174   case 'M':
04175   case 'N':
04176     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
04177     if (!C)
04178       return;
04179 
04180     // Grab the value and do some validation.
04181     uint64_t CVal = C->getZExtValue();
04182     switch (ConstraintLetter) {
04183     // The I constraint applies only to simple ADD or SUB immediate operands:
04184     // i.e. 0 to 4095 with optional shift by 12
04185     // The J constraint applies only to ADD or SUB immediates that would be
04186     // valid when negated, i.e. if [an add pattern] were to be output as a SUB
04187     // instruction [or vice versa], in other words -1 to -4095 with optional
04188     // left shift by 12.
04189     case 'I':
04190       if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
04191         break;
04192       return;
04193     case 'J': {
04194       uint64_t NVal = -C->getSExtValue();
04195       if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
04196         CVal = C->getSExtValue();
04197         break;
04198       }
04199       return;
04200     }
04201     // The K and L constraints apply *only* to logical immediates, including
04202     // what used to be the MOVI alias for ORR (though the MOVI alias has now
04203     // been removed and MOV should be used). So these constraints have to
04204     // distinguish between bit patterns that are valid 32-bit or 64-bit
04205     // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
04206     // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
04207     // versa.
04208     case 'K':
04209       if (AArch64_AM::isLogicalImmediate(CVal, 32))
04210         break;
04211       return;
04212     case 'L':
04213       if (AArch64_AM::isLogicalImmediate(CVal, 64))
04214         break;
04215       return;
04216     // The M and N constraints are a superset of K and L respectively, for use
04217     // with the MOV (immediate) alias. As well as the logical immediates they
04218     // also match 32 or 64-bit immediates that can be loaded either using a
04219     // *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
04220     // (M) or 64-bit 0x1234000000000000 (N) etc.
04221     // As a note, some of this code is liberally stolen from the asm parser.
04222     case 'M': {
04223       if (!isUInt<32>(CVal))
04224         return;
04225       if (AArch64_AM::isLogicalImmediate(CVal, 32))
04226         break;
04227       if ((CVal & 0xFFFF) == CVal)
04228         break;
04229       if ((CVal & 0xFFFF0000ULL) == CVal)
04230         break;
04231       uint64_t NCVal = ~(uint32_t)CVal;
04232       if ((NCVal & 0xFFFFULL) == NCVal)
04233         break;
04234       if ((NCVal & 0xFFFF0000ULL) == NCVal)
04235         break;
04236       return;
04237     }
04238     case 'N': {
04239       if (AArch64_AM::isLogicalImmediate(CVal, 64))
04240         break;
04241       if ((CVal & 0xFFFFULL) == CVal)
04242         break;
04243       if ((CVal & 0xFFFF0000ULL) == CVal)
04244         break;
04245       if ((CVal & 0xFFFF00000000ULL) == CVal)
04246         break;
04247       if ((CVal & 0xFFFF000000000000ULL) == CVal)
04248         break;
04249       uint64_t NCVal = ~CVal;
04250       if ((NCVal & 0xFFFFULL) == NCVal)
04251         break;
04252       if ((NCVal & 0xFFFF0000ULL) == NCVal)
04253         break;
04254       if ((NCVal & 0xFFFF00000000ULL) == NCVal)
04255         break;
04256       if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
04257         break;
04258       return;
04259     }
04260     default:
04261       return;
04262     }
04263 
04264     // All assembler immediates are 64-bit integers.
04265     Result = DAG.getTargetConstant(CVal, MVT::i64);
04266     break;
04267   }
04268 
04269   if (Result.getNode()) {
04270     Ops.push_back(Result);
04271     return;
04272   }
04273 
04274   return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
04275 }
04276 
04277 //===----------------------------------------------------------------------===//
04278 //                     AArch64 Advanced SIMD Support
04279 //===----------------------------------------------------------------------===//
04280 
04281 /// WidenVector - Given a value in the V64 register class, produce the
04282 /// equivalent value in the V128 register class.
04283 static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
04284   EVT VT = V64Reg.getValueType();
04285   unsigned NarrowSize = VT.getVectorNumElements();
04286   MVT EltTy = VT.getVectorElementType().getSimpleVT();
04287   MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
04288   SDLoc DL(V64Reg);
04289 
04290   return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
04291                      V64Reg, DAG.getConstant(0, MVT::i32));
04292 }
04293 
04294 /// getExtFactor - Determine the adjustment factor for the position when
04295 /// generating an "extract from vector registers" instruction.
04296 static unsigned getExtFactor(SDValue &V) {
04297   EVT EltType = V.getValueType().getVectorElementType();
04298   return EltType.getSizeInBits() / 8;
04299 }
04300 
04301 /// NarrowVector - Given a value in the V128 register class, produce the
04302 /// equivalent value in the V64 register class.
04303 static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
04304   EVT VT = V128Reg.getValueType();
04305   unsigned WideSize = VT.getVectorNumElements();
04306   MVT EltTy = VT.getVectorElementType().getSimpleVT();
04307   MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2);
04308   SDLoc DL(V128Reg);
04309 
04310   return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg);
04311 }
04312 
04313 // Gather data to see if the operation can be modelled as a
04314 // shuffle in combination with VEXTs.
04315 SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
04316                                                   SelectionDAG &DAG) const {
04317   assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
04318   SDLoc dl(Op);
04319   EVT VT = Op.getValueType();
04320   unsigned NumElts = VT.getVectorNumElements();
04321 
04322   struct ShuffleSourceInfo {
04323     SDValue Vec;
04324     unsigned MinElt;
04325     unsigned MaxElt;
04326 
04327     // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
04328     // be compatible with the shuffle we intend to construct. As a result
04329     // ShuffleVec will be some sliding window into the original Vec.
04330     SDValue ShuffleVec;
04331 
04332     // Code should guarantee that element i in Vec starts at element
04333     // "WindowBase + i * WindowScale" in ShuffleVec.
04334     int WindowBase;
04335     int WindowScale;
04336 
04337     bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
04338     ShuffleSourceInfo(SDValue Vec)
04339         : Vec(Vec), MinElt(UINT_MAX), MaxElt(0), ShuffleVec(Vec), WindowBase(0),
04340           WindowScale(1) {}
04341   };
04342 
04343   // First gather all vectors used as an immediate source for this BUILD_VECTOR
04344   // node.
04345   SmallVector<ShuffleSourceInfo, 2> Sources;
04346   for (unsigned i = 0; i < NumElts; ++i) {
04347     SDValue V = Op.getOperand(i);
04348     if (V.getOpcode() == ISD::UNDEF)
04349       continue;
04350     else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
04351       // A shuffle can only come from building a vector from various
04352       // elements of other vectors.
04353       return SDValue();
04354     }
04355 
04356     // Add this element source to the list if it's not already there.
04357     SDValue SourceVec = V.getOperand(0);
04358     auto Source = std::find(Sources.begin(), Sources.end(), SourceVec);
04359     if (Source == Sources.end())
04360       Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
04361 
04362     // Update the minimum and maximum lane number seen.
04363     unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
04364     Source->MinElt = std::min(Source->MinElt, EltNo);
04365     Source->MaxElt = std::max(Source->MaxElt, EltNo);
04366   }
04367 
04368   // Currently only do something sane when at most two source vectors
04369   // are involved.
04370   if (Sources.size() > 2)
04371     return SDValue();
04372 
04373   // Find out the smallest element size among result and two sources, and use
04374   // it as element size to build the shuffle_vector.
04375   EVT SmallestEltTy = VT.getVectorElementType();
04376   for (auto &Source : Sources) {
04377     EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
04378     if (SrcEltTy.bitsLT(SmallestEltTy)) {
04379       SmallestEltTy = SrcEltTy;
04380     }
04381   }
04382   unsigned ResMultiplier =
04383       VT.getVectorElementType().getSizeInBits() / SmallestEltTy.getSizeInBits();
04384   NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
04385   EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
04386 
04387   // If the source vector is too wide or too narrow, we may nevertheless be able
04388   // to construct a compatible shuffle either by concatenating it with UNDEF or
04389   // extracting a suitable range of elements.
04390   for (auto &Src : Sources) {
04391     EVT SrcVT = Src.ShuffleVec.getValueType();
04392 
04393     if (SrcVT.getSizeInBits() == VT.getSizeInBits())
04394       continue;
04395 
04396     // This stage of the search produces a source with the same element type as
04397     // the original, but with a total width matching the BUILD_VECTOR output.
04398     EVT EltVT = SrcVT.getVectorElementType();
04399     EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
04400                                   VT.getSizeInBits() / EltVT.getSizeInBits());
04401 
04402     if (SrcVT.getSizeInBits() < VT.getSizeInBits()) {
04403       assert(2 * SrcVT.getSizeInBits() == VT.getSizeInBits());
04404       // We can pad out the smaller vector for free, so if it's part of a
04405       // shuffle...
04406       Src.ShuffleVec =
04407           DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
04408                       DAG.getUNDEF(Src.ShuffleVec.getValueType()));
04409       continue;
04410     }
04411 
04412     assert(SrcVT.getSizeInBits() == 2 * VT.getSizeInBits());
04413 
04414     if (Src.MaxElt - Src.MinElt >= NumElts) {
04415       // Span too large for a VEXT to cope
04416       return SDValue();
04417     }
04418 
04419     if (Src.MinElt >= NumElts) {
04420       // The extraction can just take the second half
04421       Src.ShuffleVec =
04422           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
04423                       DAG.getIntPtrConstant(NumElts));
04424       Src.WindowBase = -NumElts;
04425     } else if (Src.MaxElt < NumElts) {
04426       // The extraction can just take the first half
04427       Src.ShuffleVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT,
04428                                    Src.ShuffleVec, DAG.getIntPtrConstant(0));
04429     } else {
04430       // An actual VEXT is needed
04431       SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT,
04432                                      Src.ShuffleVec, DAG.getIntPtrConstant(0));
04433       SDValue VEXTSrc2 =
04434           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
04435                       DAG.getIntPtrConstant(NumElts));
04436       unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
04437 
04438       Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
04439                                    VEXTSrc2, DAG.getConstant(Imm, MVT::i32));
04440       Src.WindowBase = -Src.MinElt;
04441     }
04442   }
04443 
04444   // Another possible incompatibility occurs from the vector element types. We
04445   // can fix this by bitcasting the source vectors to the same type we intend
04446   // for the shuffle.
04447   for (auto &Src : Sources) {
04448     EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
04449     if (SrcEltTy == SmallestEltTy)
04450       continue;
04451     assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
04452     Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
04453     Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
04454     Src.WindowBase *= Src.WindowScale;
04455   }
04456 
04457   // Final sanity check before we try to actually produce a shuffle.
04458   DEBUG(
04459     for (auto Src : Sources)
04460       assert(Src.ShuffleVec.getValueType() == ShuffleVT);
04461   );
04462 
04463   // The stars all align; our next step is to produce the mask for the shuffle.
04464   SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
04465   int BitsPerShuffleLane = ShuffleVT.getVectorElementType().getSizeInBits();
04466   for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
04467     SDValue Entry = Op.getOperand(i);
04468     if (Entry.getOpcode() == ISD::UNDEF)
04469       continue;
04470 
04471     auto Src = std::find(Sources.begin(), Sources.end(), Entry.getOperand(0));
04472     int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
04473 
04474     // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
04475     // trunc. So only std::min(SrcBits, DestBits) bits actually get defined in this
04476     // segment.
04477     EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
04478     int BitsDefined = std::min(OrigEltTy.getSizeInBits(),
04479                                VT.getVectorElementType().getSizeInBits());
04480     int LanesDefined = BitsDefined / BitsPerShuffleLane;
04481 
04482     // This source is expected to fill ResMultiplier lanes of the final shuffle,
04483     // starting at the appropriate offset.
04484     int *LaneMask = &Mask[i * ResMultiplier];
04485 
04486     int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
04487     ExtractBase += NumElts * (Src - Sources.begin());
04488     for (int j = 0; j < LanesDefined; ++j)
04489       LaneMask[j] = ExtractBase + j;
04490   }
04491 
04492   // Final check before we try to produce nonsense...
04493   if (!isShuffleMaskLegal(Mask, ShuffleVT))
04494     return SDValue();
04495 
04496   SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
04497   for (unsigned i = 0; i < Sources.size(); ++i)
04498     ShuffleOps[i] = Sources[i].ShuffleVec;
04499 
04500   SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
04501                                          ShuffleOps[1], &Mask[0]);
04502   return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
04503 }
04504 
04505 // check if an EXT instruction can handle the shuffle mask when the
04506 // vector sources of the shuffle are the same.
04507 static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
04508   unsigned NumElts = VT.getVectorNumElements();
04509 
04510   // Assume that the first shuffle index is not UNDEF.  Fail if it is.
04511   if (M[0] < 0)
04512     return false;
04513 
04514   Imm = M[0];
04515 
04516   // If this is a VEXT shuffle, the immediate value is the index of the first
04517   // element.  The other shuffle indices must be the successive elements after
04518   // the first one.
04519   unsigned ExpectedElt = Imm;
04520   for (unsigned i = 1; i < NumElts; ++i) {
04521     // Increment the expected index.  If it wraps around, just follow it
04522     // back to index zero and keep going.
04523     ++ExpectedElt;
04524     if (ExpectedElt == NumElts)
04525       ExpectedElt = 0;
04526 
04527     if (M[i] < 0)
04528       continue; // ignore UNDEF indices
04529     if (ExpectedElt != static_cast<unsigned>(M[i]))
04530       return false;
04531   }
04532 
04533   return true;
04534 }
04535 
04536 // check if an EXT instruction can handle the shuffle mask when the
04537 // vector sources of the shuffle are different.
04538 static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
04539                       unsigned &Imm) {
04540   // Look for the first non-undef element.
04541   const int *FirstRealElt = std::find_if(M.begin(), M.end(),
04542       [](int Elt) {return Elt >= 0;});
04543 
04544   // Benefit from APInt to handle overflow when calculating the expected element.
04545   unsigned NumElts = VT.getVectorNumElements();
04546   unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
04547   APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
04548   // The following shuffle indices must be the successive elements after the
04549   // first real element.
04550   const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(),
04551       [&](int Elt) {return Elt != ExpectedElt++ && Elt != -1;});
04552   if (FirstWrongElt != M.end())
04553     return false;
04554 
04555   // The index of an EXT is the first element if it is not UNDEF.
04556   // Watch out for the beginning UNDEFs. The EXT index should be the expected
04557   // value of the first element.  E.g. 
04558   // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
04559   // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
04560   // ExpectedElt is the last mask index plus 1.
04561   Imm = ExpectedElt.getZExtValue();
04562 
04563   // There are two different cases that require reversing the input vectors.
04564   // For example, for vector <4 x i32> we have the following cases,
04565   // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
04566   // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
04567   // For both cases, we finally use mask <5, 6, 7, 0>, which requires
04568   // reversing the two input vectors.
04569   if (Imm < NumElts)
04570     ReverseEXT = true;
04571   else
04572     Imm -= NumElts;
04573 
04574   return true;
04575 }
04576 
04577 /// isREVMask - Check if a vector shuffle corresponds to a REV
04578 /// instruction with the specified blocksize.  (The order of the elements
04579 /// within each block of the vector is reversed.)
04580 static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
04581   assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
04582          "Only possible block sizes for REV are: 16, 32, 64");
04583 
04584   unsigned EltSz = VT.getVectorElementType().getSizeInBits();
04585   if (EltSz == 64)
04586     return false;
04587 
04588   unsigned NumElts = VT.getVectorNumElements();
04589   unsigned BlockElts = M[0] + 1;
04590   // If the first shuffle index is UNDEF, be optimistic.
04591   if (M[0] < 0)
04592     BlockElts = BlockSize / EltSz;
04593 
04594   if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
04595     return false;
04596 
04597   for (unsigned i = 0; i < NumElts; ++i) {
04598     if (M[i] < 0)
04599       continue; // ignore UNDEF indices
04600     if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
04601       return false;
04602   }
04603 
04604   return true;
04605 }
04606 
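/// isZIPMask - Check whether a shuffle mask interleaves corresponding halves
/// of the two inputs: e.g. <0, 4, 1, 5> (low halves, WhichResult == 0) or
/// <2, 6, 3, 7> (high halves, WhichResult == 1) for a 4-element shuffle.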
04607 static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
04608   unsigned NumElts = VT.getVectorNumElements();
04609   WhichResult = (M[0] == 0 ? 0 : 1);
04610   unsigned Idx = WhichResult * NumElts / 2;
04611   for (unsigned i = 0; i != NumElts; i += 2) {
04612     if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
04613         (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts))
04614       return false;
04615     Idx += 1;
04616   }
04617 
04618   return true;
04619 }
04620 
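/// isUZPMask - Check whether a shuffle mask picks the even (WhichResult == 0)
/// or odd (WhichResult == 1) elements of the concatenated inputs, e.g.
/// <0, 2, 4, 6> or <1, 3, 5, 7> for a 4-element shuffle.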
04621 static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
04622   unsigned NumElts = VT.getVectorNumElements();
04623   WhichResult = (M[0] == 0 ? 0 : 1);
04624   for (unsigned i = 0; i != NumElts; ++i) {
04625     if (M[i] < 0)
04626       continue; // ignore UNDEF indices
04627     if ((unsigned)M[i] != 2 * i + WhichResult)
04628       return false;
04629   }
04630 
04631   return true;
04632 }
04633 
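/// isTRNMask - Check whether a shuffle mask transposes element pairs from the
/// two inputs, e.g. <0, 4, 2, 6> (WhichResult == 0) or <1, 5, 3, 7>
/// (WhichResult == 1) for a 4-element shuffle.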
04634 static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
04635   unsigned NumElts = VT.getVectorNumElements();
04636   WhichResult = (M[0] == 0 ? 0 : 1);
04637   for (unsigned i = 0; i < NumElts; i += 2) {
04638     if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
04639         (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
04640       return false;
04641   }
04642   return true;
04643 }
04644 
04645 /// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
04646 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
04647 /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
04648 static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
04649   unsigned NumElts = VT.getVectorNumElements();
04650   WhichResult = (M[0] == 0 ? 0 : 1);
04651   unsigned Idx = WhichResult * NumElts / 2;
04652   for (unsigned i = 0; i != NumElts; i += 2) {
04653     if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
04654         (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
04655       return false;
04656     Idx += 1;
04657   }
04658 
04659   return true;
04660 }
04661 
04662 /// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
04663 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
04664 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
04665 static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
04666   unsigned Half = VT.getVectorNumElements() / 2;
04667   WhichResult = (M[0] == 0 ? 0 : 1);
04668   for (unsigned j = 0; j != 2; ++j) {
04669     unsigned Idx = WhichResult;
04670     for (unsigned i = 0; i != Half; ++i) {
04671       int MIdx = M[i + j * Half];
04672       if (MIdx >= 0 && (unsigned)MIdx != Idx)
04673         return false;
04674       Idx += 2;
04675     }
04676   }
04677 
04678   return true;
04679 }
04680 
04681 /// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
04682 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
04683 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
04684 static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
04685   unsigned NumElts = VT.getVectorNumElements();
04686   WhichResult = (M[0] == 0 ? 0 : 1);
04687   for (unsigned i = 0; i < NumElts; i += 2) {
04688     if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
04689         (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
04690       return false;
04691   }
04692   return true;
04693 }
04694 
04695 static bool isINSMask(ArrayRef<int> M, int NumInputElements,
04696                       bool &DstIsLeft, int &Anomaly) {
04697   if (M.size() != static_cast<size_t>(NumInputElements))
04698     return false;
04699 
04700   int NumLHSMatch = 0, NumRHSMatch = 0;
04701   int LastLHSMismatch = -1, LastRHSMismatch = -1;
04702 
04703   for (int i = 0; i < NumInputElements; ++i) {
04704     if (M[i] == -1) {
04705       ++NumLHSMatch;
04706       ++NumRHSMatch;
04707       continue;
04708     }
04709 
04710     if (M[i] == i)
04711       ++NumLHSMatch;
04712     else
04713       LastLHSMismatch = i;
04714 
04715     if (M[i] == i + NumInputElements)
04716       ++NumRHSMatch;
04717     else
04718       LastRHSMismatch = i;
04719   }
04720 
04721   if (NumLHSMatch == NumInputElements - 1) {
04722     DstIsLeft = true;
04723     Anomaly = LastLHSMismatch;
04724     return true;
04725   } else if (NumRHSMatch == NumInputElements - 1) {
04726     DstIsLeft = false;
04727     Anomaly = LastRHSMismatch;
04728     return true;
04729   }
04730 
04731   return false;
04732 }
04733 
04734 static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
04735   if (VT.getSizeInBits() != 128)
04736     return false;
04737 
04738   unsigned NumElts = VT.getVectorNumElements();
04739 
04740   for (int I = 0, E = NumElts / 2; I != E; I++) {
04741     if (Mask[I] != I)
04742       return false;
04743   }
04744 
04745   int Offset = NumElts / 2;
04746   for (int I = NumElts / 2, E = NumElts; I != E; I++) {
04747     if (Mask[I] != I + SplitLHS * Offset)
04748       return false;
04749   }
04750 
04751   return true;
04752 }
04753 
04754 static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
04755   SDLoc DL(Op);
04756   EVT VT = Op.getValueType();
04757   SDValue V0 = Op.getOperand(0);
04758   SDValue V1 = Op.getOperand(1);
04759   ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
04760 
04761   if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
04762       VT.getVectorElementType() != V1.getValueType().getVectorElementType())
04763     return SDValue();
04764 
04765   bool SplitV0 = V0.getValueType().getSizeInBits() == 128;
04766 
04767   if (!isConcatMask(Mask, VT, SplitV0))
04768     return SDValue();
04769 
04770   EVT CastVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
04771                                 VT.getVectorNumElements() / 2);
04772   if (SplitV0) {
04773     V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
04774                      DAG.getConstant(0, MVT::i64));
04775   }
04776   if (V1.getValueType().getSizeInBits() == 128) {
04777     V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
04778                      DAG.getConstant(0, MVT::i64));
04779   }
04780   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
04781 }
04782 
04783 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
04784 /// the specified operations to build the shuffle.
04785 static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
04786                                       SDValue RHS, SelectionDAG &DAG,
04787                                       SDLoc dl) {
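  // Each perfect-shuffle table entry packs an opcode in bits [29:26] and two
  // 13-bit operand IDs in bits [25:13] and [12:0]; each ID is a base-9
  // encoding of a 4-element mask that is expanded recursively below. This is
  // a summary of the encoding as consumed here; the table itself lives in
  // AArch64PerfectShuffle.h.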
04788   unsigned OpNum = (PFEntry >> 26) & 0x0F;
04789   unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
04790   unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
04791 
04792   enum {
04793     OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
04794     OP_VREV,
04795     OP_VDUP0,
04796     OP_VDUP1,
04797     OP_VDUP2,
04798     OP_VDUP3,
04799     OP_VEXT1,
04800     OP_VEXT2,
04801     OP_VEXT3,
04802     OP_VUZPL, // VUZP, left result
04803     OP_VUZPR, // VUZP, right result
04804     OP_VZIPL, // VZIP, left result
04805     OP_VZIPR, // VZIP, right result
04806     OP_VTRNL, // VTRN, left result
04807     OP_VTRNR  // VTRN, right result
04808   };
04809 
04810   if (OpNum == OP_COPY) {
04811     if (LHSID == (1 * 9 + 2) * 9 + 3)
04812       return LHS;
04813     assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
04814     return RHS;
04815   }
04816 
04817   SDValue OpLHS, OpRHS;
04818   OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
04819   OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
04820   EVT VT = OpLHS.getValueType();
04821 
04822   switch (OpNum) {
04823   default:
04824     llvm_unreachable("Unknown shuffle opcode!");
04825   case OP_VREV:
04826     // VREV divides the vector in half and swaps within the half.
04827     if (VT.getVectorElementType() == MVT::i32 ||
04828         VT.getVectorElementType() == MVT::f32)
04829       return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
04830     // vrev <4 x i16> -> REV32
04831     if (VT.getVectorElementType() == MVT::i16 ||
04832         VT.getVectorElementType() == MVT::f16)
04833       return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
04834     // vrev <4 x i8> -> REV16
04835     assert(VT.getVectorElementType() == MVT::i8);
04836     return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
04837   case OP_VDUP0:
04838   case OP_VDUP1:
04839   case OP_VDUP2:
04840   case OP_VDUP3: {
04841     EVT EltTy = VT.getVectorElementType();
04842     unsigned Opcode;
04843     if (EltTy == MVT::i8)
04844       Opcode = AArch64ISD::DUPLANE8;
04845     else if (EltTy == MVT::i16)
04846       Opcode = AArch64ISD::DUPLANE16;
04847     else if (EltTy == MVT::i32 || EltTy == MVT::f32)
04848       Opcode = AArch64ISD::DUPLANE32;
04849     else if (EltTy == MVT::i64 || EltTy == MVT::f64)
04850       Opcode = AArch64ISD::DUPLANE64;
04851     else
04852       llvm_unreachable("Invalid vector element type?");
04853 
04854     if (VT.getSizeInBits() == 64)
04855       OpLHS = WidenVector(OpLHS, DAG);
04856     SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, MVT::i64);
04857     return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
04858   }
04859   case OP_VEXT1:
04860   case OP_VEXT2:
04861   case OP_VEXT3: {
04862     unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
04863     return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
04864                        DAG.getConstant(Imm, MVT::i32));
04865   }
04866   case OP_VUZPL:
04867     return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS,
04868                        OpRHS);
04869   case OP_VUZPR:
04870     return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS,
04871                        OpRHS);
04872   case OP_VZIPL:
04873     return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS,
04874                        OpRHS);
04875   case OP_VZIPR:
04876     return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS,
04877                        OpRHS);
04878   case OP_VTRNL:
04879     return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS,
04880                        OpRHS);
04881   case OP_VTRNR:
04882     return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS,
04883                        OpRHS);
04884   }
04885 }
04886 
04887 static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
04888                            SelectionDAG &DAG) {
04889   // Check to see if we can use the TBL instruction.
04890   SDValue V1 = Op.getOperand(0);
04891   SDValue V2 = Op.getOperand(1);
04892   SDLoc DL(Op);
04893 
04894   EVT EltVT = Op.getValueType().getVectorElementType();
04895   unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
04896 
04897   SmallVector<SDValue, 8> TBLMask;
04898   for (int Val : ShuffleMask) {
04899     for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
04900       unsigned Offset = Byte + Val * BytesPerElt;
04901       TBLMask.push_back(DAG.getConstant(Offset, MVT::i32));
04902     }
04903   }
04904 
04905   MVT IndexVT = MVT::v8i8;
04906   unsigned IndexLen = 8;
04907   if (Op.getValueType().getSizeInBits() == 128) {
04908     IndexVT = MVT::v16i8;
04909     IndexLen = 16;
04910   }
04911 
04912   SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
04913   SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
04914 
04915   SDValue Shuffle;
04916   if (V2.getNode()->getOpcode() == ISD::UNDEF) {
04917     if (IndexLen == 8)
04918       V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
04919     Shuffle = DAG.getNode(
04920         ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
04921         DAG.getConstant(Intrinsic::aarch64_neon_tbl1, MVT::i32), V1Cst,
04922         DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT,
04923                     makeArrayRef(TBLMask.data(), IndexLen)));
04924   } else {
04925     if (IndexLen == 8) {
04926       V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
04927       Shuffle = DAG.getNode(
04928           ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
04929           DAG.getConstant(Intrinsic::aarch64_neon_tbl1, MVT::i32), V1Cst,
04930           DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT,
04931                       makeArrayRef(TBLMask.data(), IndexLen)));
04932     } else {
04933       // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
04934       // cannot currently represent the register constraints on the input
04935       // table registers.
04936       //  Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
04937       //                   DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT,
04938       //                               &TBLMask[0], IndexLen));
04939       Shuffle = DAG.getNode(
04940           ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
04941           DAG.getConstant(Intrinsic::aarch64_neon_tbl2, MVT::i32), V1Cst, V2Cst,
04942           DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT,
04943                       makeArrayRef(TBLMask.data(), IndexLen)));
04944     }
04945   }
04946   return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
04947 }
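
// Worked example (illustrative) of the byte-level mask built above: for a
// v4i16 shuffle with mask <0, 2, 5, 7>, BytesPerElt is 2, so TBLMask becomes
// {0,1, 4,5, 10,11, 14,15}. Both sources are 64 bits wide, so IndexLen is 8
// and the two inputs are concatenated into a single v16i8 table, letting one
// TBL1 pick bytes from either source vector.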
04948 
04949 static unsigned getDUPLANEOp(EVT EltType) {
04950   if (EltType == MVT::i8)
04951     return AArch64ISD::DUPLANE8;
04952   if (EltType == MVT::i16 || EltType == MVT::f16)
04953     return AArch64ISD::DUPLANE16;
04954   if (EltType == MVT::i32 || EltType == MVT::f32)
04955     return AArch64ISD::DUPLANE32;
04956   if (EltType == MVT::i64 || EltType == MVT::f64)
04957     return AArch64ISD::DUPLANE64;
04958 
04959   llvm_unreachable("Invalid vector element type?");
04960 }
04961 
04962 SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
04963                                                    SelectionDAG &DAG) const {
04964   SDLoc dl(Op);
04965   EVT VT = Op.getValueType();
04966 
04967   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
04968 
04969   // Convert shuffles that are directly supported on NEON to target-specific
04970   // DAG nodes, instead of keeping them as shuffles and matching them again
04971   // during code selection.  This is more efficient and avoids the possibility
04972   // of inconsistencies between legalization and selection.
04973   ArrayRef<int> ShuffleMask = SVN->getMask();
04974 
04975   SDValue V1 = Op.getOperand(0);
04976   SDValue V2 = Op.getOperand(1);
04977 
04978   if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0],
04979                                        V1.getValueType().getSimpleVT())) {
04980     int Lane = SVN->getSplatIndex();
04981     // If this is an undef splat, generate it via "just" a vdup, if possible.
04982     if (Lane == -1)
04983       Lane = 0;
04984 
04985     if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
04986       return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
04987                          V1.getOperand(0));
04988     // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
04989     // constant. If so, we can just reference the lane's definition directly.
04990     if (V1.getOpcode() == ISD::BUILD_VECTOR &&
04991         !isa<ConstantSDNode>(V1.getOperand(Lane)))
04992       return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
04993 
04994     // Otherwise, duplicate from the lane of the input vector.
04995     unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
04996 
04997     // SelectionDAGBuilder may have "helpfully" already extracted or concatenated
04998     // to make a vector of the same size as this SHUFFLE. We can ignore the
04999     // extract entirely, and canonicalise the concat using WidenVector.
05000     if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
05001       Lane += cast<ConstantSDNode>(V1.getOperand(1))->getZExtValue();
05002       V1 = V1.getOperand(0);
05003     } else if (V1.getOpcode() == ISD::CONCAT_VECTORS) {
05004       unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
05005       Lane -= Idx * VT.getVectorNumElements() / 2;
05006       V1 = WidenVector(V1.getOperand(Idx), DAG);
05007     } else if (VT.getSizeInBits() == 64)
05008       V1 = WidenVector(V1, DAG);
05009 
05010     return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, MVT::i64));
05011   }
05012 
05013   if (isREVMask(ShuffleMask, VT, 64))
05014     return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
05015   if (isREVMask(ShuffleMask, VT, 32))
05016     return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
05017   if (isREVMask(ShuffleMask, VT, 16))
05018     return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);
05019 
05020   bool ReverseEXT = false;
05021   unsigned Imm;
05022   if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
05023     if (ReverseEXT)
05024       std::swap(V1, V2);
05025     Imm *= getExtFactor(V1);
05026     return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
05027                        DAG.getConstant(Imm, MVT::i32));
05028   } else if (V2->getOpcode() == ISD::UNDEF &&
05029              isSingletonEXTMask(ShuffleMask, VT, Imm)) {
05030     Imm *= getExtFactor(V1);
05031     return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
05032                        DAG.getConstant(Imm, MVT::i32));
05033   }
05034 
05035   unsigned WhichResult;
05036   if (isZIPMask(ShuffleMask, VT, WhichResult)) {
05037     unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
05038     return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
05039   }
05040   if (isUZPMask(ShuffleMask, VT, WhichResult)) {
05041     unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
05042     return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
05043   }
05044   if (isTRNMask(ShuffleMask, VT, WhichResult)) {
05045     unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
05046     return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
05047   }
05048 
05049   if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
05050     unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
05051     return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
05052   }
05053   if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
05054     unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
05055     return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
05056   }
05057   if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
05058     unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
05059     return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
05060   }
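
  // For reference (illustrative masks): with v4i32 operands, ZIP1 corresponds
  // to the shuffle mask <0,4,1,5>, UZP1 to <0,2,4,6> and TRN1 to <0,4,2,6>;
  // the *2 variants take the other halves, e.g. ZIP2 is <2,6,3,7>.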
05061 
05062   SDValue Concat = tryFormConcatFromShuffle(Op, DAG);
05063   if (Concat.getNode())
05064     return Concat;
05065 
05066   bool DstIsLeft;
05067   int Anomaly;
05068   int NumInputElements = V1.getValueType().getVectorNumElements();
05069   if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
05070     SDValue DstVec = DstIsLeft ? V1 : V2;
05071     SDValue DstLaneV = DAG.getConstant(Anomaly, MVT::i64);
05072 
05073     SDValue SrcVec = V1;
05074     int SrcLane = ShuffleMask[Anomaly];
05075     if (SrcLane >= NumInputElements) {
05076       SrcVec = V2;
05077       SrcLane -= VT.getVectorNumElements();
05078     }
05079     SDValue SrcLaneV = DAG.getConstant(SrcLane, MVT::i64);
05080 
05081     EVT ScalarVT = VT.getVectorElementType();
05082 
05083     if (ScalarVT.getSizeInBits() < 32 && ScalarVT.isInteger())
05084       ScalarVT = MVT::i32;
05085 
05086     return DAG.getNode(
05087         ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
05088         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
05089         DstLaneV);
05090   }
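
  // Worked example (illustrative): for a v4i32 mask <0, 1, 6, 3>, lane 2 is
  // the single "anomaly" coming from V2, so this emits an INSERT_VECTOR_ELT of
  // element 2 extracted from V2 into lane 2 of V1, which should select as INS.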
05091 
05092   // If the shuffle is not directly supported and it has 4 elements, use
05093   // the PerfectShuffle-generated table to synthesize it from other shuffles.
05094   unsigned NumElts = VT.getVectorNumElements();
05095   if (NumElts == 4) {
05096     unsigned PFIndexes[4];
05097     for (unsigned i = 0; i != 4; ++i) {
05098       if (ShuffleMask[i] < 0)
05099         PFIndexes[i] = 8;
05100       else
05101         PFIndexes[i] = ShuffleMask[i];
05102     }
05103 
05104     // Compute the index in the perfect shuffle table.
05105     unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
05106                             PFIndexes[2] * 9 + PFIndexes[3];
05107     unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
05108     unsigned Cost = (PFEntry >> 30);
05109 
05110     if (Cost <= 4)
05111       return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
05112   }
05113 
05114   return GenerateTBL(Op, ShuffleMask, DAG);
05115 }
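
// Worked example (illustrative) of the perfect-shuffle indexing used above:
// a 4-element mask <0, 4, 1, 5> gives PFIndexes {0, 4, 1, 5} and therefore
// PFTableIndex = 0*729 + 4*81 + 1*9 + 5 = 338; an undef lane is encoded as 8,
// so <0, 4, u, 5> would index entry 0*729 + 4*81 + 8*9 + 5 = 401 instead.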
05116 
05117 static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
05118                                APInt &UndefBits) {
05119   EVT VT = BVN->getValueType(0);
05120   APInt SplatBits, SplatUndef;
05121   unsigned SplatBitSize;
05122   bool HasAnyUndefs;
05123   if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
05124     unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
05125 
05126     for (unsigned i = 0; i < NumSplats; ++i) {
05127       CnstBits <<= SplatBitSize;
05128       UndefBits <<= SplatBitSize;
05129       CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
05130       UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
05131     }
05132 
05133     return true;
05134   }
05135 
05136   return false;
05137 }
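
// Worked example (illustrative): for a v4i32 BUILD_VECTOR splatting 0xFF,
// SplatBitSize is 32 and NumSplats is 4, so CnstBits ends up as the 128-bit
// pattern 0x000000FF repeated four times; callers then compare the high and
// low 64-bit halves before trying to match an AdvSIMD modified immediate.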
05138 
05139 SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op,
05140                                               SelectionDAG &DAG) const {
05141   BuildVectorSDNode *BVN =
05142       dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
05143   SDValue LHS = Op.getOperand(0);
05144   SDLoc dl(Op);
05145   EVT VT = Op.getValueType();
05146 
05147   if (!BVN)
05148     return Op;
05149 
05150   APInt CnstBits(VT.getSizeInBits(), 0);
05151   APInt UndefBits(VT.getSizeInBits(), 0);
05152   if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
05153     // We only have BIC vector immediate instruction, which is and-not.
05154     CnstBits = ~CnstBits;
05155 
05156     // We make use of a little bit of goto ickiness in order to avoid having to
05157     // duplicate the immediate matching logic for the undef toggled case.
05158     bool SecondTry = false;
05159   AttemptModImm:
05160 
05161     if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
05162       CnstBits = CnstBits.zextOrTrunc(64);
05163       uint64_t CnstVal = CnstBits.getZExtValue();
05164 
05165       if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
05166         CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
05167         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
05168         SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
05169                                   DAG.getConstant(CnstVal, MVT::i32),
05170                                   DAG.getConstant(0, MVT::i32));
05171         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
05172       }
05173 
05174       if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
05175         CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
05176         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
05177         SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
05178                                   DAG.getConstant(CnstVal, MVT::i32),
05179                                   DAG.getConstant(8, MVT::i32));
05180         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
05181       }
05182 
05183       if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
05184         CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
05185         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
05186         SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
05187                                   DAG.getConstant(CnstVal, MVT::i32),
05188                                   DAG.getConstant(16, MVT::i32));
05189         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
05190       }
05191 
05192       if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
05193         CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
05194         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
05195         SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
05196                                   DAG.getConstant(CnstVal, MVT::i32),
05197                                   DAG.getConstant(24, MVT::i32));
05198         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
05199       }
05200 
05201       if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
05202         CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
05203         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
05204         SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
05205                                   DAG.getConstant(CnstVal, MVT::i32),
05206                                   DAG.getConstant(0, MVT::i32));
05207         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
05208       }
05209 
05210       if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
05211         CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
05212         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
05213         SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
05214                                   DAG.getConstant(CnstVal, MVT::i32),
05215                                   DAG.getConstant(8, MVT::i32));
05216         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
05217       }
05218     }
05219 
05220     if (SecondTry)
05221       goto FailedModImm;
05222     SecondTry = true;
05223     CnstBits = ~UndefBits;
05224     goto AttemptModImm;
05225   }
05226 
05227 // We can always fall back to a non-immediate AND.
05228 FailedModImm:
05229   return Op;
05230 }
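
// Worked example (illustrative): a v4i32 AND with the splatted mask 0xFFFFFF00
// is inverted to 0x000000FF, which matches AdvSIMD modified-immediate type 1,
// so the lowering above produces (BICi LHS, #0xFF, LSL #0) instead of a full
// vector AND with a materialized constant.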
05231 
05232 // Specialized code to quickly find if PotentialBVec is a BuildVector that
05233 // consists of a single repeated constant int value, which is returned in the
05234 // reference arg ConstVal.
05235 static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
05236                                      uint64_t &ConstVal) {
05237   BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
05238   if (!Bvec)
05239     return false;
05240   ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
05241   if (!FirstElt)
05242     return false;
05243   EVT VT = Bvec->getValueType(0);
05244   unsigned NumElts = VT.getVectorNumElements();
05245   for (unsigned i = 1; i < NumElts; ++i)
05246     if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
05247       return false;
05248   ConstVal = FirstElt->getZExtValue();
05249   return true;
05250 }
05251 
05252 static unsigned getIntrinsicID(const SDNode *N) {
05253   unsigned Opcode = N->getOpcode();
05254   switch (Opcode) {
05255   default:
05256     return Intrinsic::not_intrinsic;
05257   case ISD::INTRINSIC_WO_CHAIN: {
05258     unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
05259     if (IID < Intrinsic::num_intrinsics)
05260       return IID;
05261     return Intrinsic::not_intrinsic;
05262   }
05263   }
05264 }
05265 
05266 // Attempt to form a vector S[LR]I, i.e. lower (or (and X, BvecC1), (lsl Y, C2))
05267 // to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
05268 // BUILD_VECTOR with constant element C1, C2 is a constant, and C1 == ~C2.
05269 // Also, a logical shift right lowers to SRI, with the same structure.
05270 static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
05271   EVT VT = N->getValueType(0);
05272 
05273   if (!VT.isVector())
05274     return SDValue();
05275 
05276   SDLoc DL(N);
05277 
05278   // Is the first op an AND?
05279   const SDValue And = N->getOperand(0);
05280   if (And.getOpcode() != ISD::AND)
05281     return SDValue();
05282 
05283   // Is the second op a shl or lshr?
05284   SDValue Shift = N->getOperand(1);
05285   // This will have been turned into: AArch64ISD::VSHL vector, #shift
05286   // or AArch64ISD::VLSHR vector, #shift
05287   unsigned ShiftOpc = Shift.getOpcode();
05288   if ((ShiftOpc != AArch64ISD::VSHL && ShiftOpc != AArch64ISD::VLSHR))
05289     return SDValue();
05290   bool IsShiftRight = ShiftOpc == AArch64ISD::VLSHR;
05291 
05292   // Is the shift amount constant?
05293   ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
05294   if (!C2node)
05295     return SDValue();
05296 
05297   // Is the and mask vector all constant?
05298   uint64_t C1;
05299   if (!isAllConstantBuildVector(And.getOperand(1), C1))
05300     return SDValue();
05301 
05302   // Is C1 == ~C2, taking into account how much one can shift elements of a
05303   // particular size?
05304   uint64_t C2 = C2node->getZExtValue();
05305   unsigned ElemSizeInBits = VT.getVectorElementType().getSizeInBits();
05306   if (C2 > ElemSizeInBits)
05307     return SDValue();
05308   unsigned ElemMask = (1 << ElemSizeInBits) - 1;
05309   if ((C1 & ElemMask) != (~C2 & ElemMask))
05310     return SDValue();
05311 
05312   SDValue X = And.getOperand(0);
05313   SDValue Y = Shift.getOperand(0);
05314 
05315   unsigned Intrin =
05316       IsShiftRight ? Intrinsic::aarch64_neon_vsri : Intrinsic::aarch64_neon_vsli;
05317   SDValue ResultSLI =
05318       DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
05319                   DAG.getConstant(Intrin, MVT::i32), X, Y, Shift.getOperand(1));
05320 
05321   DEBUG(dbgs() << "aarch64-lower: transformed: \n");
05322   DEBUG(N->dump(&DAG));
05323   DEBUG(dbgs() << "into: \n");
05324   DEBUG(ResultSLI->dump(&DAG));
05325 
05326   ++NumShiftInserts;
05327   return ResultSLI;
05328 }
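
// For reference, the underlying instruction semantics (roughly): SLI Vd, Vn, #s
// shifts each lane of Vn left by s and inserts the result into Vd, keeping only
// the low s bits of each Vd lane; SRI is the mirror image for right shifts,
// keeping the high s bits. Hence the AND mask must clear exactly the bits the
// shifted value will occupy.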
05329 
05330 SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
05331                                              SelectionDAG &DAG) const {
05332   // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
05333   if (EnableAArch64SlrGeneration) {
05334     SDValue Res = tryLowerToSLI(Op.getNode(), DAG);
05335     if (Res.getNode())
05336       return Res;
05337   }
05338 
05339   BuildVectorSDNode *BVN =
05340       dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
05341   SDValue LHS = Op.getOperand(1);
05342   SDLoc dl(Op);
05343   EVT VT = Op.getValueType();
05344 
05345   // OR commutes, so try swapping the operands.
05346   if (!BVN) {
05347     LHS = Op.getOperand(0);
05348     BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
05349   }
05350   if (!BVN)
05351     return Op;
05352 
05353   APInt CnstBits(VT.getSizeInBits(), 0);
05354   APInt UndefBits(VT.getSizeInBits(), 0);
05355   if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
05356     // We make use of a little bit of goto ickiness in order to avoid having to
05357     // duplicate the immediate matching logic for the undef toggled case.
05358     bool SecondTry = false;
05359   AttemptModImm:
05360 
05361     if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
05362       CnstBits = CnstBits.zextOrTrunc(64);
05363       uint64_t CnstVal = CnstBits.getZExtValue();
05364 
05365       if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
05366         CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
05367         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
05368         SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
05369                                   DAG.getConstant(CnstVal, MVT::i32),
05370                                   DAG.getConstant(0, MVT::i32));
05371         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
05372       }
05373 
05374       if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
05375         CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
05376         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
05377         SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
05378                                   DAG.getConstant(CnstVal, MVT::i32),
05379                                   DAG.getConstant(8, MVT::i32));
05380         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
05381       }
05382 
05383       if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
05384         CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
05385         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
05386         SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
05387                                   DAG.getConstant(CnstVal, MVT::i32),
05388                                   DAG.getConstant(16, MVT::i32));
05389         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
05390       }
05391 
05392       if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
05393         CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
05394         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
05395         SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
05396                                   DAG.getConstant(CnstVal, MVT::i32),
05397                                   DAG.getConstant(24, MVT::i32));
05398         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
05399       }
05400 
05401       if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
05402         CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
05403         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
05404         SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
05405                                   DAG.getConstant(CnstVal, MVT::i32),
05406                                   DAG.getConstant(0, MVT::i32));
05407         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
05408       }
05409 
05410       if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
05411         CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
05412         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
05413         SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
05414                                   DAG.getConstant(CnstVal, MVT::i32),
05415                                   DAG.getConstant(8, MVT::i32));
05416         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
05417       }
05418     }
05419 
05420     if (SecondTry)
05421       goto FailedModImm;
05422     SecondTry = true;
05423     CnstBits = UndefBits;
05424     goto AttemptModImm;
05425   }
05426 
05427 // We can always fall back to a non-immediate OR.
05428 FailedModImm:
05429   return Op;
05430 }
05431 
05432 // Normalize the operands of BUILD_VECTOR. The value of constant operands will
05433 // be truncated to fit the element width.
05434 static SDValue NormalizeBuildVector(SDValue Op,
05435                                     SelectionDAG &DAG) {
05436   assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
05437   SDLoc dl(Op);
05438   EVT VT = Op.getValueType();
05439   EVT EltTy = VT.getVectorElementType();
05440 
05441   if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
05442     return Op;
05443 
05444   SmallVector<SDValue, 16> Ops;
05445   for (unsigned I = 0, E = VT.getVectorNumElements(); I != E; ++I) {
05446     SDValue Lane = Op.getOperand(I);
05447     if (Lane.getOpcode() == ISD::Constant) {
05448       APInt LowBits(EltTy.getSizeInBits(),
05449                     cast<ConstantSDNode>(Lane)->getZExtValue());
05450       Lane = DAG.getConstant(LowBits.getZExtValue(), MVT::i32);
05451     }
05452     Ops.push_back(Lane);
05453   }
05454   return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
05455 }
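
// Worked example (illustrative): for a v8i8 BUILD_VECTOR whose lanes are i32
// constants after type legalization, a lane value of 0x1FF is truncated to the
// 8-bit value 0xFF and rebuilt as the i32 constant 255, so the splat analysis
// below only ever sees values that fit the element width.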
05456 
05457 SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
05458                                                  SelectionDAG &DAG) const {
05459   SDLoc dl(Op);
05460   EVT VT = Op.getValueType();
05461   Op = NormalizeBuildVector(Op, DAG);
05462   BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
05463 
05464   APInt CnstBits(VT.getSizeInBits(), 0);
05465   APInt UndefBits(VT.getSizeInBits(), 0);
05466   if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
05467     // We make use of a little bit of goto ickiness in order to avoid having to
05468     // duplicate the immediate matching logic for the undef toggled case.
05469     bool SecondTry = false;
05470   AttemptModImm:
05471 
05472     if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
05473       CnstBits = CnstBits.zextOrTrunc(64);
05474       uint64_t CnstVal = CnstBits.getZExtValue();
05475 
05476       // Certain magic vector constants (used to express things like NOT
05477       // and NEG) are passed through unmodified.  This allows codegen patterns
05478       // for these operations to match.  Special-purpose patterns will lower
05479       // these immediates to MOVIs if it proves necessary.
05480       if (VT.isInteger() && (CnstVal == 0 || CnstVal == ~0ULL))
05481         return Op;
05482 
05483       // The many faces of MOVI...
05484       if (AArch64_AM::isAdvSIMDModImmType10(CnstVal)) {
05485         CnstVal = AArch64_AM::encodeAdvSIMDModImmType10(CnstVal);
05486         if (VT.getSizeInBits() == 128) {
05487           SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::v2i64,
05488                                     DAG.getConstant(CnstVal, MVT::i32));
05489           return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
05490         }
05491 
05492         // Support the V64 version via subregister insertion.
05493         SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::f64,
05494                                   DAG.getConstant(CnstVal, MVT::i32));
05495         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
05496       }
05497 
05498       if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
05499         CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
05500         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
05501         SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
05502                                   DAG.getConstant(CnstVal, MVT::i32),
05503                                   DAG.getConstant(0, MVT::i32));
05504         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
05505       }
05506 
05507       if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
05508         CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
05509         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
05510         SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
05511                                   DAG.getConstant(CnstVal, MVT::i32),
05512                                   DAG.getConstant(8, MVT::i32));
05513         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
05514       }
05515 
05516       if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
05517         CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
05518         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
05519         SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
05520                                   DAG.getConstant(CnstVal, MVT::i32),
05521                                   DAG.getConstant(16, MVT::i32));
05522         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
05523       }
05524 
05525       if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
05526         CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
05527         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
05528         SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
05529                                   DAG.getConstant(CnstVal, MVT::i32),
05530                                   DAG.getConstant(24, MVT::i32));
05531         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
05532       }
05533 
05534       if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
05535         CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
05536         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
05537         SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
05538                                   DAG.getConstant(CnstVal, MVT::i32),
05539                                   DAG.getConstant(0, MVT::i32));
05540         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
05541       }
05542 
05543       if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
05544         CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
05545         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
05546         SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
05547                                   DAG.getConstant(CnstVal, MVT::i32),
05548                                   DAG.getConstant(8, MVT::i32));
05549         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
05550       }
05551 
05552       if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) {
05553         CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal);
05554         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
05555         SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy,
05556                                   DAG.getConstant(CnstVal, MVT::i32),
05557                                   DAG.getConstant(264, MVT::i32));
05558         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
05559       }
05560 
05561       if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) {
05562         CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal);
05563         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
05564         SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy,
05565                                   DAG.getConstant(CnstVal, MVT::i32),
05566                                   DAG.getConstant(272, MVT::i32));
05567         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
05568       }
05569 
05570       if (AArch64_AM::isAdvSIMDModImmType9(CnstVal)) {
05571         CnstVal = AArch64_AM::encodeAdvSIMDModImmType9(CnstVal);
05572         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
05573         SDValue Mov = DAG.getNode(AArch64ISD::MOVI, dl, MovTy,
05574                                   DAG.getConstant(CnstVal, MVT::i32));
05575         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
05576       }
05577 
05578       // The few faces of FMOV...
05579       if (AArch64_AM::isAdvSIMDModImmType11(CnstVal)) {
05580         CnstVal = AArch64_AM::encodeAdvSIMDModImmType11(CnstVal);
05581         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4f32 : MVT::v2f32;
05582         SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MovTy,
05583                                   DAG.getConstant(CnstVal, MVT::i32));
05584         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
05585       }
05586 
05587       if (AArch64_AM::isAdvSIMDModImmType12(CnstVal) &&
05588           VT.getSizeInBits() == 128) {
05589         CnstVal = AArch64_AM::encodeAdvSIMDModImmType12(CnstVal);
05590         SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MVT::v2f64,
05591                                   DAG.getConstant(CnstVal, MVT::i32));
05592         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
05593       }
05594 
05595       // The many faces of MVNI...
05596       CnstVal = ~CnstVal;
05597       if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
05598         CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
05599         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
05600         SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
05601                                   DAG.getConstant(CnstVal, MVT::i32),
05602                                   DAG.getConstant(0, MVT::i32));
05603         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
05604       }
05605 
05606       if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
05607         CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
05608         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
05609         SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
05610                                   DAG.getConstant(CnstVal, MVT::i32),
05611                                   DAG.getConstant(8, MVT::i32));
05612         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
05613       }
05614 
05615       if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
05616         CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
05617         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
05618         SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
05619                                   DAG.getConstant(CnstVal, MVT::i32),
05620                                   DAG.getConstant(16, MVT::i32));
05621         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
05622       }
05623 
05624       if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
05625         CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
05626         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
05627         SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
05628                                   DAG.getConstant(CnstVal, MVT::i32),
05629                                   DAG.getConstant(24, MVT::i32));
05630         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
05631       }
05632 
05633       if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
05634         CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
05635         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
05636         SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
05637                                   DAG.getConstant(CnstVal, MVT::i32),
05638                                   DAG.getConstant(0, MVT::i32));
05639         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
05640       }
05641 
05642       if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
05643         CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
05644         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
05645         SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
05646                                   DAG.getConstant(CnstVal, MVT::i32),
05647                                   DAG.getConstant(8, MVT::i32));
05648         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
05649       }
05650 
05651       if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) {
05652         CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal);
05653         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
05654         SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy,
05655                                   DAG.getConstant(CnstVal, MVT::i32),
05656                                   DAG.getConstant(264, MVT::i32));
05657         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
05658       }
05659 
05660       if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) {
05661         CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal);
05662         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
05663         SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy,
05664                                   DAG.getConstant(CnstVal, MVT::i32),
05665                                   DAG.getConstant(272, MVT::i32));
05666         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
05667       }
05668     }
05669 
05670     if (SecondTry)
05671       goto FailedModImm;
05672     SecondTry = true;
05673     CnstBits = UndefBits;
05674     goto AttemptModImm;
05675   }
05676 FailedModImm:
05677 
05678   // Scan through the operands to find some interesting properties we can
05679   // exploit:
05680   //   1) If only one value is used, we can use a DUP, or
05681   //   2) if only the low element is not undef, we can just insert that, or
05682   //   3) if only one constant value is used (w/ some non-constant lanes),
05683   //      we can splat the constant value into the whole vector then fill
05684   //      in the non-constant lanes.
05685   //   4) FIXME: If different constant values are used, but we can intelligently
05686   //             select the values we'll be overwriting for the non-constant
05687   //             lanes such that we can directly materialize the vector
05688   //             some other way (MOVI, e.g.), we can be sneaky.
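  //
  // Illustrative cases: <x, undef, undef, undef> hits 2) and becomes a
  // SCALAR_TO_VECTOR; <x, x, x, x> with a non-constant x hits 1) and becomes a
  // DUP; <1.0, 1.0, x, 1.0> hits 3) and becomes a DUP of 1.0 followed by an
  // INSERT_VECTOR_ELT of x into lane 2.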
05689   unsigned NumElts = VT.getVectorNumElements();
05690   bool isOnlyLowElement = true;
05691   bool usesOnlyOneValue = true;
05692   bool usesOnlyOneConstantValue = true;
05693   bool isConstant = true;
05694   unsigned NumConstantLanes = 0;
05695   SDValue Value;
05696   SDValue ConstantValue;
05697   for (unsigned i = 0; i < NumElts; ++i) {
05698     SDValue V = Op.getOperand(i);
05699     if (V.getOpcode() == ISD::UNDEF)
05700       continue;
05701     if (i > 0)
05702       isOnlyLowElement = false;
05703     if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
05704       isConstant = false;
05705 
05706     if (isa<ConstantSDNode>(V) || isa<ConstantFPSDNode>(V)) {
05707       ++NumConstantLanes;
05708       if (!ConstantValue.getNode())
05709         ConstantValue = V;
05710       else if (ConstantValue != V)
05711         usesOnlyOneConstantValue = false;
05712     }
05713 
05714     if (!Value.getNode())
05715       Value = V;
05716     else if (V != Value)
05717       usesOnlyOneValue = false;
05718   }
05719 
05720   if (!Value.getNode())
05721     return DAG.getUNDEF(VT);
05722 
05723   if (isOnlyLowElement)
05724     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
05725 
05726   // Use DUP for non-constant splats.  For FP constant splats, bitcast the
05727   // lanes to the equivalent integer type and try again.
05728   if (usesOnlyOneValue) {
05729     if (!isConstant) {
05730       if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
05731           Value.getValueType() != VT)
05732         return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);
05733 
05734       // This is actually a DUPLANExx operation, which keeps everything in vector registers.
05735 
05736       // DUPLANE works on 128-bit vectors, widen it if necessary.
05737       SDValue Lane = Value.getOperand(1);
05738       Value = Value.getOperand(0);
05739       if (Value.getValueType().getSizeInBits() == 64)
05740         Value = WidenVector(Value, DAG);
05741 
05742       unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
05743       return DAG.getNode(Opcode, dl, VT, Value, Lane);
05744     }
05745 
05746     if (VT.getVectorElementType().isFloatingPoint()) {
05747       SmallVector<SDValue, 8> Ops;
05748       MVT NewType =
05749           (VT.getVectorElementType() == MVT::f32) ? MVT::i32 : MVT::i64;
05750       for (unsigned i = 0; i < NumElts; ++i)
05751         Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
05752       EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
05753       SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, Ops);
05754       Val = LowerBUILD_VECTOR(Val, DAG);
05755       if (Val.getNode())
05756         return DAG.getNode(ISD::BITCAST, dl, VT, Val);
05757     }
05758   }
05759 
05760   // If only one distinct constant value was used, for one or more lanes,
05761   // start by splatting that value, then replace the non-constant lanes. This
05762   // is better than the default, which will perform a separate initialization
05763   // for each lane.
05764   if (NumConstantLanes > 0 && usesOnlyOneConstantValue) {
05765     SDValue Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
05766     // Now insert the non-constant lanes.
05767     for (unsigned i = 0; i < NumElts; ++i) {
05768       SDValue V = Op.getOperand(i);
05769       SDValue LaneIdx = DAG.getConstant(i, MVT::i64);
05770       if (!isa<ConstantSDNode>(V) && !isa<ConstantFPSDNode>(V)) {
05771         // Note that type legalization likely mucked about with the VT of the
05772         // source operand, so we may have to convert it here before inserting.
05773         Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
05774       }
05775     }
05776     return Val;
05777   }
05778 
05779   // If all elements are constants and the case above didn't get hit, fall back
05780   // to the default expansion, which will generate a load from the constant
05781   // pool.
05782   if (isConstant)
05783     return SDValue();
05784 
05785   // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
05786   if (NumElts >= 4) {
05787     SDValue shuffle = ReconstructShuffle(Op, DAG);
05788     if (shuffle != SDValue())
05789       return shuffle;
05790   }
05791 
05792   // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
05793   // know the default expansion would otherwise fall back on something even
05794   // worse. For a vector with one or two non-undef values, that default is
05795   // scalar_to_vector for the elements followed by a shuffle (provided the
05796   // shuffle is valid for the target); for everything else, it is
05797   // materialization element by element on the stack followed by a load.
05798   if (!isConstant && !usesOnlyOneValue) {
05799     SDValue Vec = DAG.getUNDEF(VT);
05800     SDValue Op0 = Op.getOperand(0);
05801     unsigned ElemSize = VT.getVectorElementType().getSizeInBits();
05802     unsigned i = 0;
05803     // For 32 and 64 bit types, use INSERT_SUBREG for lane zero to
05804     // a) Avoid a RMW dependency on the full vector register, and
05805     // b) Allow the register coalescer to fold away the copy if the
05806     //    value is already in an S or D register.
05807     if (Op0.getOpcode() != ISD::UNDEF && (ElemSize == 32 || ElemSize == 64)) {
05808       unsigned SubIdx = ElemSize == 32 ? AArch64::ssub : AArch64::dsub;
05809       MachineSDNode *N =
05810           DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, VT, Vec, Op0,
05811                              DAG.getTargetConstant(SubIdx, MVT::i32));
05812       Vec = SDValue(N, 0);
05813       ++i;
05814     }
05815     for (; i < NumElts; ++i) {
05816       SDValue V = Op.getOperand(i);
05817       if (V.getOpcode() == ISD::UNDEF)
05818         continue;
05819       SDValue LaneIdx = DAG.getConstant(i, MVT::i64);
05820       Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
05821     }
05822     return Vec;
05823   }
05824 
05825   // Just use the default expansion. We failed to find a better alternative.
05826   return SDValue();
05827 }
05828 
05829 SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
05830                                                       SelectionDAG &DAG) const {
05831   assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
05832 
05833   // Check for non-constant or out of range lane.
05834   EVT VT = Op.getOperand(0).getValueType();
05835   ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
05836   if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
05837     return SDValue();
05838 
05839 
05840   // Insertion/extraction are legal for V128 types.
05841   if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
05842       VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
05843       VT == MVT::v8f16)
05844     return Op;
05845 
05846   if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
05847       VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16)
05848     return SDValue();
05849 
05850   // For V64 types, we perform insertion by expanding the value
05851   // to a V128 type and perform the insertion on that.
05852   SDLoc DL(Op);
05853   SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
05854   EVT WideTy = WideVec.getValueType();
05855 
05856   SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec,
05857                              Op.getOperand(1), Op.getOperand(2));
05858   // Re-narrow the resultant vector.
05859   return NarrowVector(Node, DAG);
05860 }
05861 
05862 SDValue
05863 AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
05864                                                SelectionDAG &DAG) const {
05865   assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
05866 
05867   // Check for non-constant or out of range lane.
05868   EVT VT = Op.getOperand(0).getValueType();
05869   ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
05870   if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
05871     return SDValue();
05872 
05873 
05874   // Insertion/extraction are legal for V128 types.
05875   if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
05876       VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
05877       VT == MVT::v8f16)
05878     return Op;
05879 
05880   if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
05881       VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16)
05882     return SDValue();
05883 
05884   // For V64 types, we perform extraction by expanding the value
05885   // to a V128 type and perform the extraction on that.
05886   SDLoc DL(Op);
05887   SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
05888   EVT WideTy = WideVec.getValueType();
05889 
05890   EVT ExtrTy = WideTy.getVectorElementType();
05891   if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
05892     ExtrTy = MVT::i32;
05893 
05894   // For extractions, we just return the result directly.
05895   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
05896                      Op.getOperand(1));
05897 }
05898 
05899 SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
05900                                                       SelectionDAG &DAG) const {
05901   EVT VT = Op.getOperand(0).getValueType();
05902   SDLoc dl(Op);
05903   // Just in case...
05904   if (!VT.isVector())
05905     return SDValue();
05906 
05907   ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1));
05908   if (!Cst)
05909     return SDValue();
05910   unsigned Val = Cst->getZExtValue();
05911 
05912   unsigned Size = Op.getValueType().getSizeInBits();
05913   if (Val == 0) {
05914     switch (Size) {
05915     case 8:
05916       return DAG.getTargetExtractSubreg(AArch64::bsub, dl, Op.getValueType(),
05917                                         Op.getOperand(0));
05918     case 16:
05919       return DAG.getTargetExtractSubreg(AArch64::hsub, dl, Op.getValueType(),
05920                                         Op.getOperand(0));
05921     case 32:
05922       return DAG.getTargetExtractSubreg(AArch64::ssub, dl, Op.getValueType(),
05923                                         Op.getOperand(0));
05924     case 64:
05925       return DAG.getTargetExtractSubreg(AArch64::dsub, dl, Op.getValueType(),
05926                                         Op.getOperand(0));
05927     default:
05928       llvm_unreachable("Unexpected vector type in extract_subvector!");
05929     }
05930   }
05931   // If this is extracting the upper 64-bits of a 128-bit vector, we match
05932   // that directly.
05933   if (Size == 64 && Val * VT.getVectorElementType().getSizeInBits() == 64)
05934     return Op;
05935 
05936   return SDValue();
05937 }
05938 
05939 bool AArch64TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
05940                                                EVT VT) const {
05941   if (VT.getVectorNumElements() == 4 &&
05942       (VT.is128BitVector() || VT.is64BitVector())) {
05943     unsigned PFIndexes[4];
05944     for (unsigned i = 0; i != 4; ++i) {
05945       if (M[i] < 0)
05946         PFIndexes[i] = 8;
05947       else
05948         PFIndexes[i] = M[i];
05949     }
05950 
05951     // Compute the index in the perfect shuffle table.
05952     unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
05953                             PFIndexes[2] * 9 + PFIndexes[3];
05954     unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
05955     unsigned Cost = (PFEntry >> 30);
05956 
05957     if (Cost <= 4)
05958       return true;
05959   }
05960 
05961   bool DummyBool;
05962   int DummyInt;
05963   unsigned DummyUnsigned;
05964 
05965   return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) ||
05966           isREVMask(M, VT, 32) || isREVMask(M, VT, 16) ||
05967           isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
05968           // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM.
05969           isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) ||
05970           isZIPMask(M, VT, DummyUnsigned) ||
05971           isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
05972           isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
05973           isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
05974           isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) ||
05975           isConcatMask(M, VT, VT.getSizeInBits() == 128));
05976 }
05977 
05978 /// getVShiftImm - Check if this is a valid build_vector for the immediate
05979 /// operand of a vector shift operation, where all the elements of the
05980 /// build_vector must have the same constant integer value.
05981 static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
05982   // Ignore bit_converts.
05983   while (Op.getOpcode() == ISD::BITCAST)
05984     Op = Op.getOperand(0);
05985   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
05986   APInt SplatBits, SplatUndef;
05987   unsigned SplatBitSize;
05988   bool HasAnyUndefs;
05989   if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
05990                                     HasAnyUndefs, ElementBits) ||
05991       SplatBitSize > ElementBits)
05992     return false;
05993   Cnt = SplatBits.getSExtValue();
05994   return true;
05995 }
05996 
05997 /// isVShiftLImm - Check if this is a valid build_vector for the immediate
05998 /// operand of a vector shift left operation.  That value must be in the range:
05999 ///   0 <= Value < ElementBits for a left shift; or
06000 ///   0 <= Value <= ElementBits for a long left shift.
06001 static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
06002   assert(VT.isVector() && "vector shift count is not a vector type");
06003   unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
06004   if (!getVShiftImm(Op, ElementBits, Cnt))
06005     return false;
06006   return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
06007 }
06008 
06009 /// isVShiftRImm - Check if this is a valid build_vector for the immediate
06010 /// operand of a vector shift right operation.  For a shift opcode, the value
06011 /// is positive, but for an intrinsic the shift count must be negative. The
06012 /// absolute value must be in the range:
06013 ///   1 <= |Value| <= ElementBits for a right shift; or
06014 ///   1 <= |Value| <= ElementBits/2 for a narrow right shift.
06015 static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
06016                          int64_t &Cnt) {
06017   assert(VT.isVector() && "vector shift count is not a vector type");
06018   unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
06019   if (!getVShiftImm(Op, ElementBits, Cnt))
06020     return false;
06021   if (isIntrinsic)
06022     Cnt = -Cnt;
06023   return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
06024 }
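
// Worked example (illustrative): for v8i16 operands (ElementBits == 16) a
// left-shift immediate must be in [0, 15] ([0, 16] for a long shift), while a
// right-shift immediate must be in [1, 16] ([1, 8] for a narrowing shift);
// intrinsic forms encode right shifts as negative counts, hence the negation.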
06025 
06026 SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
06027                                                       SelectionDAG &DAG) const {
06028   EVT VT = Op.getValueType();
06029   SDLoc DL(Op);
06030   int64_t Cnt;
06031 
06032   if (!Op.getOperand(1).getValueType().isVector())
06033     return Op;
06034   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
06035 
06036   switch (Op.getOpcode()) {
06037   default:
06038     llvm_unreachable("unexpected shift opcode");
06039 
06040   case ISD::SHL:
06041     if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
06042       return DAG.getNode(AArch64ISD::VSHL, SDLoc(Op), VT, Op.getOperand(0),
06043                          DAG.getConstant(Cnt, MVT::i32));
06044     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
06045                        DAG.getConstant(Intrinsic::aarch64_neon_ushl, MVT::i32),
06046                        Op.getOperand(0), Op.getOperand(1));
06047   case ISD::SRA:
06048   case ISD::SRL:
06049     // Right shift immediate
06050     if (isVShiftRImm(Op.getOperand(1), VT, false, false, Cnt) &&
06051         Cnt < EltSize) {
06052       unsigned Opc =
06053           (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
06054       return DAG.getNode(Opc, SDLoc(Op), VT, Op.getOperand(0),
06055                          DAG.getConstant(Cnt, MVT::i32));
06056     }
06057 
06058     // Right shift by register.  Note that there is no shift-right-by-register
06059     // instruction; the shift-left-by-register instruction takes a signed
06060     // value, where negative amounts specify a right shift.
06061     unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
06062                                                 : Intrinsic::aarch64_neon_ushl;
06063     // negate the shift amount
06064     SDValue NegShift = DAG.getNode(AArch64ISD::NEG, DL, VT, Op.getOperand(1));
06065     SDValue NegShiftLeft =
06066         DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
06067                     DAG.getConstant(Opc, MVT::i32), Op.getOperand(0), NegShift);
06068     return NegShiftLeft;
06069   }
06070 
06071   return SDValue();
06072 }
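
// Worked example (illustrative): a v4i32 arithmetic right shift by the
// constant 3 becomes (VASHR x, #3), whereas a shift by a non-constant vector
// amount y has no direct right-shift-by-register form and is emitted as
// sshl(x, neg(y)) via the aarch64_neon_sshl intrinsic.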
06073 
06074 static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
06075                                     AArch64CC::CondCode CC, bool NoNans, EVT VT,
06076                                     SDLoc dl, SelectionDAG &DAG) {
06077   EVT SrcVT = LHS.getValueType();
06078 
06079   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
06080   APInt CnstBits(VT.getSizeInBits(), 0);
06081   APInt UndefBits(VT.getSizeInBits(), 0);
06082   bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits);
06083   bool IsZero = IsCnst && (CnstBits == 0);
06084 
06085   if (SrcVT.getVectorElementType().isFloatingPoint()) {
06086     switch (CC) {
06087     default:
06088       return SDValue();
06089     case AArch64CC::NE: {
06090       SDValue Fcmeq;
06091       if (IsZero)
06092         Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
06093       else
06094         Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
06095       return DAG.getNode(AArch64ISD::NOT, dl, VT, Fcmeq);
06096     }
06097     case AArch64CC::EQ:
06098       if (IsZero)
06099         return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
06100       return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
06101     case AArch64CC::GE:
06102       if (IsZero)
06103         return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
06104       return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
06105     case AArch64CC::GT:
06106       if (IsZero)
06107         return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
06108       return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
06109     case AArch64CC::LS:
06110       if (IsZero)
06111         return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
06112       return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
06113     case AArch64CC::LT:
06114       if (!NoNans)
06115         return SDValue();
06116     // If we ignore NaNs then we can use the MI implementation.
06117     // Fallthrough.
06118     case AArch64CC::MI:
06119       if (IsZero)
06120         return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
06121       return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
06122     }
06123   }
06124 
06125   switch (CC) {
06126   default:
06127     return SDValue();
06128   case AArch64CC::NE: {
06129     SDValue Cmeq;
06130     if (IsZero)
06131       Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
06132     else
06133       Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
06134     return DAG.getNode(AArch64ISD::NOT, dl, VT, Cmeq);
06135   }
06136   case AArch64CC::EQ:
06137     if (IsZero)
06138       return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
06139     return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
06140   case AArch64CC::GE:
06141     if (IsZero)
06142       return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
06143     return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
06144   case AArch64CC::GT:
06145     if (IsZero)
06146       return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
06147     return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
06148   case AArch64CC::LE:
06149     if (IsZero)
06150       return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
06151     return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
06152   case AArch64CC::LS:
06153     return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
06154   case AArch64CC::LO:
06155     return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
06156   case AArch64CC::LT:
06157     if (IsZero)
06158       return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
06159     return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
06160   case AArch64CC::HI:
06161     return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
06162   case AArch64CC::HS:
06163     return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
06164   }
06165 }
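
// Worked example (illustrative): an integer AArch64CC::GT compare lowers to
// CMGT (or CMGTz when the RHS is a constant-zero build_vector), and a
// condition with no direct instruction, such as NE, is built from the EQ form
// followed by a NOT.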
06166 
06167 SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
06168                                            SelectionDAG &DAG) const {
06169   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
06170   SDValue LHS = Op.getOperand(0);
06171   SDValue RHS = Op.getOperand(1);
06172   SDLoc dl(Op);
06173 
06174   if (LHS.getValueType().getVectorElementType().isInteger()) {
06175     assert(LHS.getValueType() == RHS.getValueType());
06176     AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
06177     return EmitVectorComparison(LHS, RHS, AArch64CC, false, Op.getValueType(),
06178                                 dl, DAG);
06179   }
06180 
06181   assert(LHS.getValueType().getVectorElementType() == MVT::f32 ||
06182          LHS.getValueType().getVectorElementType() == MVT::f64);
06183 
06184   // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
06185   // clean.  Some of them require two branches to implement.
06186   AArch64CC::CondCode CC1, CC2;
06187   bool ShouldInvert;
06188   changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
06189 
06190   bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath;
06191   SDValue Cmp =
06192       EmitVectorComparison(LHS, RHS, CC1, NoNaNs, Op.getValueType(), dl, DAG);
06193   if (!Cmp.getNode())
06194     return SDValue();
06195 
06196   if (CC2 != AArch64CC::AL) {
06197     SDValue Cmp2 =
06198         EmitVectorComparison(LHS, RHS, CC2, NoNaNs, Op.getValueType(), dl, DAG);
06199     if (!Cmp2.getNode())
06200       return SDValue();
06201 
06202     Cmp = DAG.getNode(ISD::OR, dl, Cmp.getValueType(), Cmp, Cmp2);
06203   }
06204 
06205   if (ShouldInvert)
06206     return DAG.getNOT(dl, Cmp, Cmp.getValueType());
06207 
06208   return Cmp;
06209 }
06210 
06211 /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
06212 /// MemIntrinsicNodes.  The associated MachineMemOperands record the alignment
06213 /// specified in the intrinsic calls.
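// Illustrative example (not from the original source): an aarch64_neon_ld2
// call returning {<4 x i32>, <4 x i32>} touches 32 bytes, so the code below
// records a conservative memVT of v4i64 (four 8-byte lanes) covering the
// whole region accessed by the intrinsic.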
06214 bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
06215                                                const CallInst &I,
06216                                                unsigned Intrinsic) const {
06217   switch (Intrinsic) {
06218   case Intrinsic::aarch64_neon_ld2:
06219   case Intrinsic::aarch64_neon_ld3:
06220   case Intrinsic::aarch64_neon_ld4:
06221   case Intrinsic::aarch64_neon_ld1x2:
06222   case Intrinsic::aarch64_neon_ld1x3:
06223   case Intrinsic::aarch64_neon_ld1x4:
06224   case Intrinsic::aarch64_neon_ld2lane:
06225   case Intrinsic::aarch64_neon_ld3lane:
06226   case Intrinsic::aarch64_neon_ld4lane:
06227   case Intrinsic::aarch64_neon_ld2r:
06228   case Intrinsic::aarch64_neon_ld3r:
06229   case Intrinsic::aarch64_neon_ld4r: {
06230     Info.opc = ISD::INTRINSIC_W_CHAIN;
06231     // Conservatively set memVT to the entire set of vectors loaded.
06232     uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8;
06233     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
06234     Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
06235     Info.offset = 0;
06236     Info.align = 0;
06237     Info.vol = false; // volatile loads with NEON intrinsics not supported
06238     Info.readMem = true;
06239     Info.writeMem = false;
06240     return true;
06241   }
06242   case Intrinsic::aarch64_neon_st2:
06243   case Intrinsic::aarch64_neon_st3:
06244   case Intrinsic::aarch64_neon_st4:
06245   case Intrinsic::aarch64_neon_st1x2:
06246   case Intrinsic::aarch64_neon_st1x3:
06247   case Intrinsic::aarch64_neon_st1x4:
06248   case Intrinsic::aarch64_neon_st2lane:
06249   case Intrinsic::aarch64_neon_st3lane:
06250   case Intrinsic::aarch64_neon_st4lane: {
06251     Info.opc = ISD::INTRINSIC_VOID;
06252     // Conservatively set memVT to the entire set of vectors stored.
06253     unsigned NumElts = 0;
06254     for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
06255       Type *ArgTy = I.getArgOperand(ArgI)->getType();
06256       if (!ArgTy->isVectorTy())
06257         break;
06258       NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8;
06259     }
06260     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
06261     Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
06262     Info.offset = 0;
06263     Info.align = 0;
06264     Info.vol = false; // volatile stores with NEON intrinsics not supported
06265     Info.readMem = false;
06266     Info.writeMem = true;
06267     return true;
06268   }
06269   case Intrinsic::aarch64_ldaxr:
06270   case Intrinsic::aarch64_ldxr: {
06271     PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
06272     Info.opc = ISD::INTRINSIC_W_CHAIN;
06273     Info.memVT = MVT::getVT(PtrTy->getElementType());
06274     Info.ptrVal = I.getArgOperand(0);
06275     Info.offset = 0;
06276     Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType());
06277     Info.vol = true;
06278     Info.readMem = true;
06279     Info.writeMem = false;
06280     return true;
06281   }
06282   case Intrinsic::aarch64_stlxr:
06283   case Intrinsic::aarch64_stxr: {
06284     PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
06285     Info.opc = ISD::INTRINSIC_W_CHAIN;
06286     Info.memVT = MVT::getVT(PtrTy->getElementType());
06287     Info.ptrVal = I.getArgOperand(1);
06288     Info.offset = 0;
06289     Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType());
06290     Info.vol = true;
06291     Info.readMem = false;
06292     Info.writeMem = true;
06293     return true;
06294   }
06295   case Intrinsic::aarch64_ldaxp:
06296   case Intrinsic::aarch64_ldxp: {
06297     Info.opc = ISD::INTRINSIC_W_CHAIN;
06298     Info.memVT = MVT::i128;
06299     Info.ptrVal = I.getArgOperand(0);
06300     Info.offset = 0;
06301     Info.align = 16;
06302     Info.vol = true;
06303     Info.readMem = true;
06304     Info.writeMem = false;
06305     return true;
06306   }
06307   case Intrinsic::aarch64_stlxp:
06308   case Intrinsic::aarch64_stxp: {
06309     Info.opc = ISD::INTRINSIC_W_CHAIN;
06310     Info.memVT = MVT::i128;
06311     Info.ptrVal = I.getArgOperand(2);
06312     Info.offset = 0;
06313     Info.align = 16;
06314     Info.vol = true;
06315     Info.readMem = false;
06316     Info.writeMem = true;
06317     return true;
06318   }
06319   default:
06320     break;
06321   }
06322 
06323   return false;
06324 }
06325 
06326 // Truncation from a 64-bit GPR to a 32-bit GPR is free.
06327 bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
06328   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
06329     return false;
06330   unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
06331   unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
06332   return NumBits1 > NumBits2;
06333 }
06334 bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
06335   if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
06336     return false;
06337   unsigned NumBits1 = VT1.getSizeInBits();
06338   unsigned NumBits2 = VT2.getSizeInBits();
06339   return NumBits1 > NumBits2;
06340 }
06341 
06342 // All 32-bit GPR operations implicitly zero the high-half of the corresponding
06343 // 64-bit GPR.
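// Illustrative example (not from the original source): "add w8, w9, w10" also
// zeroes bits [63:32] of x8, so a later i32 -> i64 zero-extension of that
// value needs no extra instruction.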
06344 bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
06345   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
06346     return false;
06347   unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
06348   unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
06349   return NumBits1 == 32 && NumBits2 == 64;
06350 }
06351 bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
06352   if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
06353     return false;
06354   unsigned NumBits1 = VT1.getSizeInBits();
06355   unsigned NumBits2 = VT2.getSizeInBits();
06356   return NumBits1 == 32 && NumBits2 == 64;
06357 }
06358 
06359 bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
06360   EVT VT1 = Val.getValueType();
06361   if (isZExtFree(VT1, VT2)) {
06362     return true;
06363   }
06364 
06365   if (Val.getOpcode() != ISD::LOAD)
06366     return false;
06367 
06368   // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
06369   return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
06370           VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
06371           VT1.getSizeInBits() <= 32);
06372 }
06373 
06374 bool AArch64TargetLowering::hasPairedLoad(Type *LoadedType,
06375                                           unsigned &RequiredAlignment) const {
06376   if (!LoadedType->isIntegerTy() && !LoadedType->isFloatTy())
06377     return false;
06378   // Cyclone supports unaligned accesses.
06379   RequiredAlignment = 0;
06380   unsigned NumBits = LoadedType->getPrimitiveSizeInBits();
06381   return NumBits == 32 || NumBits == 64;
06382 }
06383 
06384 bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
06385                                           unsigned &RequiredAlignment) const {
06386   if (!LoadedType.isSimple() ||
06387       (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
06388     return false;
06389   // Cyclone supports unaligned accesses.
06390   RequiredAlignment = 0;
06391   unsigned NumBits = LoadedType.getSizeInBits();
06392   return NumBits == 32 || NumBits == 64;
06393 }
06394 
06395 static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
06396                        unsigned AlignCheck) {
06397   return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) &&
06398           (DstAlign == 0 || DstAlign % AlignCheck == 0));
06399 }
06400 
06401 EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
06402                                                unsigned SrcAlign, bool IsMemset,
06403                                                bool ZeroMemset,
06404                                                bool MemcpyStrSrc,
06405                                                MachineFunction &MF) const {
06406   // Don't use AdvSIMD to implement 16-byte memset. It would take one
06407   // instruction to materialize the v2i64 zero and one store (with a
06408   // restrictive addressing mode). Just do two i64 stores of zero-registers.
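  // Illustrative consequence (assumption, not from the original source): a
  // 32-byte memcpy with 16-byte aligned source and destination is done in
  // f128 (q-register) chunks, while a 16-byte memset falls through to i64
  // chunks, i.e. two stores of xzr.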
06409   bool Fast;
06410   const Function *F = MF.getFunction();
06411   if (Subtarget->hasFPARMv8() && !IsMemset && Size >= 16 &&
06412       !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
06413                                        Attribute::NoImplicitFloat) &&
06414       (memOpAlign(SrcAlign, DstAlign, 16) ||
06415        (allowsMisalignedMemoryAccesses(MVT::f128, 0, 1, &Fast) && Fast)))
06416     return MVT::f128;
06417 
06418   return Size >= 8 ? MVT::i64 : MVT::i32;
06419 }
06420 
06421 // 12-bit optionally shifted immediates are legal for adds.
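// Illustrative examples (not from the original source): 0xabc is encodable
// directly, 0xabc000 is encodable with the optional LSL #12, but 0xabc001
// would need both the low and the shifted field and is therefore rejected.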
06422 bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
06423   if ((Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && Immed >> 24 == 0))
06424     return true;
06425   return false;
06426 }
06427 
06428 // Integer comparisons are implemented with ADDS/SUBS, so the range of valid
06429 // immediates is the same as for an add or a sub.
06430 bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
06431   if (Immed < 0)
06432     Immed *= -1;
06433   return isLegalAddImmediate(Immed);
06434 }
06435 
06436 /// isLegalAddressingMode - Return true if the addressing mode represented
06437 /// by AM is legal for this target, for a load/store of the specified type.
06438 bool AArch64TargetLowering::isLegalAddressingMode(const AddrMode &AM,
06439                                                   Type *Ty) const {
06440   // AArch64 has five basic addressing modes:
06441   //  reg
06442   //  reg + 9-bit signed offset
06443   //  reg + SIZE_IN_BYTES * 12-bit unsigned offset
06444   //  reg1 + reg2
06445   //  reg + SIZE_IN_BYTES * reg
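  // Illustrative encodings of those modes (assembly for exposition only, with
  // arbitrary registers and an i64 load, so SIZE_IN_BYTES == 8):
  //   ldr  x0, [x1]                 ; reg
  //   ldur x0, [x1, #-17]           ; reg + 9-bit signed offset
  //   ldr  x0, [x1, #32760]         ; reg + 8 * uimm12   (4095 * 8)
  //   ldr  x0, [x1, x2]             ; reg1 + reg2
  //   ldr  x0, [x1, x2, lsl #3]     ; reg + 8 * reg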
06446 
06447   // No global is ever allowed as a base.
06448   if (AM.BaseGV)
06449     return false;
06450 
06451   // No reg+reg+imm addressing.
06452   if (AM.HasBaseReg && AM.BaseOffs && AM.Scale)
06453     return false;
06454 
06455   // check reg + imm case:
06456   // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
06457   uint64_t NumBytes = 0;
06458   if (Ty->isSized()) {
06459     uint64_t NumBits = getDataLayout()->getTypeSizeInBits(Ty);
06460     NumBytes = NumBits / 8;
06461     if (!isPowerOf2_64(NumBits))
06462       NumBytes = 0;
06463   }
06464 
06465   if (!AM.Scale) {
06466     int64_t Offset = AM.BaseOffs;
06467 
06468     // 9-bit signed offset
06469     if (Offset >= -(1LL << 9) && Offset <= (1LL << 9) - 1)
06470       return true;
06471 
06472     // 12-bit unsigned offset
06473     unsigned shift = Log2_64(NumBytes);
06474     if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
06475         // Must be a multiple of NumBytes (NumBytes is a power of 2)
06476         (Offset >> shift) << shift == Offset)
06477       return true;
06478     return false;
06479   }
06480 
06481   // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
06482 
06483   if (!AM.Scale || AM.Scale == 1 ||
06484       (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes))
06485     return true;
06486   return false;
06487 }
06488 
06489 int AArch64TargetLowering::getScalingFactorCost(const AddrMode &AM,
06490                                                 Type *Ty) const {
06491   // Scaling factors are not free at all.
06492   // Operands                     | Rt Latency
06493   // -------------------------------------------
06494   // Rt, [Xn, Xm]                 | 4
06495   // -------------------------------------------
06496   // Rt, [Xn, Xm, lsl #imm]       | Rn: 4 Rm: 5
06497   // Rt, [Xn, Wm, <extend> #imm]  |
06498   if (isLegalAddressingMode(AM, Ty))
06499     // Scale represents reg2 * scale, thus account for 1 if
06500     // it is not equal to 0 or 1.
06501     return AM.Scale != 0 && AM.Scale != 1;
06502   return -1;
06503 }
06504 
06505 bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
06506   VT = VT.getScalarType();
06507 
06508   if (!VT.isSimple())
06509     return false;
06510 
06511   switch (VT.getSimpleVT().SimpleTy) {
06512   case MVT::f32:
06513   case MVT::f64:
06514     return true;
06515   default:
06516     break;
06517   }
06518 
06519   return false;
06520 }
06521 
06522 const MCPhysReg *
06523 AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
06524   // LR is a callee-save register, but we must treat it as clobbered by any call
06525   // site. Hence we include LR in the scratch registers, which are in turn added
06526   // as implicit-defs for stackmaps and patchpoints.
06527   static const MCPhysReg ScratchRegs[] = {
06528     AArch64::X16, AArch64::X17, AArch64::LR, 0
06529   };
06530   return ScratchRegs;
06531 }
06532 
06533 bool
06534 AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N) const {
06535   EVT VT = N->getValueType(0);
06536   // If N is an unsigned bit extraction, ((x >> C) & mask), then do not
06537   // combine it with the shift so that it can still be lowered to UBFX.
06538   if (N->getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
06539       isa<ConstantSDNode>(N->getOperand(1))) {
06540     uint64_t TruncMask = N->getConstantOperandVal(1);
06541     if (isMask_64(TruncMask) &&
06542       N->getOperand(0).getOpcode() == ISD::SRL &&
06543       isa<ConstantSDNode>(N->getOperand(0)->getOperand(1)))
06544       return false;
06545   }
06546   return true;
06547 }
06548 
06549 bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
06550                                                               Type *Ty) const {
06551   assert(Ty->isIntegerTy());
06552 
06553   unsigned BitSize = Ty->getPrimitiveSizeInBits();
06554   if (BitSize == 0)
06555     return false;
06556 
06557   int64_t Val = Imm.getSExtValue();
06558   if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
06559     return true;
06560 
06561   if ((int64_t)Val < 0)
06562     Val = ~Val;
06563   if (BitSize == 32)
06564     Val &= (1LL << 32) - 1;
06565 
06566   unsigned LZ = countLeadingZeros((uint64_t)Val);
06567   unsigned Shift = (63 - LZ) / 16;
06568   // MOVZ is free so return true for one or fewer MOVK.
06569   return Shift < 3;
06570 }
06571 
06572 // Generate SUBS and CSEL for integer abs.
06573 static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
06574   EVT VT = N->getValueType(0);
06575 
06576   SDValue N0 = N->getOperand(0);
06577   SDValue N1 = N->getOperand(1);
06578   SDLoc DL(N);
06579 
06580   // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
06581   // and change it to SUB and CSEL.
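  // Illustrative i32 example (not from the original source):
  //   %s = ashr i32 %x, 31      ; Y  = SRA(X, 31)
  //   %a = add  i32 %x, %s      ; N0 = ADD(X, Y)
  //   %r = xor  i32 %a, %s      ; N  = XOR(N0, Y) == abs(%x)
  // is rewritten to roughly "cmp w_x, #0; neg w_t, w_x; csel w_r, w_x, w_t, pl".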
06582   if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
06583       N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
06584       N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0))
06585     if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
06586       if (Y1C->getAPIntValue() == VT.getSizeInBits() - 1) {
06587         SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT),
06588                                   N0.getOperand(0));
06589         // Generate SUBS & CSEL.
06590         SDValue Cmp =
06591             DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
06592                         N0.getOperand(0), DAG.getConstant(0, VT));
06593         return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0.getOperand(0), Neg,
06594                            DAG.getConstant(AArch64CC::PL, MVT::i32),
06595                            SDValue(Cmp.getNode(), 1));
06596       }
06597   return SDValue();
06598 }
06599 
06600 // performXorCombine - Attempts to handle integer ABS.
06601 static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
06602                                  TargetLowering::DAGCombinerInfo &DCI,
06603                                  const AArch64Subtarget *Subtarget) {
06604   if (DCI.isBeforeLegalizeOps())
06605     return SDValue();
06606 
06607   return performIntegerAbsCombine(N, DAG);
06608 }
06609 
06610 SDValue
06611 AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
06612                                      SelectionDAG &DAG,
06613                                      std::vector<SDNode *> *Created) const {
06614   // fold (sdiv X, pow2)
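  // Illustrative example (not from the original source), for (sdiv x, 8):
  //   add  = x + 7
  //   csel = (x < 0) ? add : x
  //   res  = csel >> 3            (arithmetic shift)
  // For a negative power-of-two divisor the result is additionally negated.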
06615   EVT VT = N->getValueType(0);
06616   if ((VT != MVT::i32 && VT != MVT::i64) ||
06617       !(Divisor.isPowerOf2() || (-Divisor).isPowerOf2()))
06618     return SDValue();
06619 
06620   SDLoc DL(N);
06621   SDValue N0 = N->getOperand(0);
06622   unsigned Lg2 = Divisor.countTrailingZeros();
06623   SDValue Zero = DAG.getConstant(0, VT);
06624   SDValue Pow2MinusOne = DAG.getConstant((1 << Lg2) - 1, VT);
06625 
06626   // Add (N0 < 0) ? Pow2 - 1 : 0;
06627   SDValue CCVal;
06628   SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL);
06629   SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
06630   SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp);
06631 
06632   if (Created) {
06633     Created->push_back(Cmp.getNode());
06634     Created->push_back(Add.getNode());
06635     Created->push_back(CSel.getNode());
06636   }
06637 
06638   // Divide by pow2.
06639   SDValue SRA =
06640       DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, MVT::i64));
06641 
06642   // If we're dividing by a positive value, we're done.  Otherwise, we must
06643   // negate the result.
06644   if (Divisor.isNonNegative())
06645     return SRA;
06646 
06647   if (Created)
06648     Created->push_back(SRA.getNode());
06649   return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT), SRA);
06650 }
06651 
06652 static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
06653                                  TargetLowering::DAGCombinerInfo &DCI,
06654                                  const AArch64Subtarget *Subtarget) {
06655   if (DCI.isBeforeLegalizeOps())
06656     return SDValue();
06657 
06658   // Multiplication of a power of two plus/minus one can be done more
06659   // cheaply as a shift+add/sub. For now, this is true unilaterally. If
06660   // future CPUs have a cheaper MADD instruction, this may need to be
06661   // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
06662   // 64-bit is 5 cycles, so this is always a win.
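  // Illustrative examples (not from the original source):
  //   (mul x, 9)  -> (add (shl x, 3), x)        ; 9 = 2^3 + 1
  //   (mul x, 7)  -> (sub (shl x, 3), x)        ; 7 = 2^3 - 1
  //   (mul x, -9) -> (sub 0, (add (shl x, 3), x))
  //   (mul x, -7) -> (sub x, (shl x, 3))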
06663   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
06664     APInt Value = C->getAPIntValue();
06665     EVT VT = N->getValueType(0);
06666     if (Value.isNonNegative()) {
06667       // (mul x, 2^N + 1) => (add (shl x, N), x)
06668       APInt VM1 = Value - 1;
06669       if (VM1.isPowerOf2()) {
06670         SDValue ShiftedVal =
06671             DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
06672                         DAG.getConstant(VM1.logBase2(), MVT::i64));
06673         return DAG.getNode(ISD::ADD, SDLoc(N), VT, ShiftedVal,
06674                            N->getOperand(0));
06675       }
06676       // (mul x, 2^N - 1) => (sub (shl x, N), x)
06677       APInt VP1 = Value + 1;
06678       if (VP1.isPowerOf2()) {
06679         SDValue ShiftedVal =
06680             DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
06681                         DAG.getConstant(VP1.logBase2(), MVT::i64));
06682         return DAG.getNode(ISD::SUB, SDLoc(N), VT, ShiftedVal,
06683                            N->getOperand(0));
06684       }
06685     } else {
06686       // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
06687       APInt VNM1 = -Value - 1;
06688       if (VNM1.isPowerOf2()) {
06689         SDValue ShiftedVal =
06690             DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
06691                         DAG.getConstant(VNM1.logBase2(), MVT::i64));
06692         SDValue Add =
06693             DAG.getNode(ISD::ADD, SDLoc(N), VT, ShiftedVal, N->getOperand(0));
06694         return DAG.getNode(ISD::SUB, SDLoc(N), VT, DAG.getConstant(0, VT), Add);
06695       }
06696       // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
06697       APInt VNP1 = -Value + 1;
06698       if (VNP1.isPowerOf2()) {
06699         SDValue ShiftedVal =
06700             DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
06701                         DAG.getConstant(VNP1.logBase2(), MVT::i64));
06702         return DAG.getNode(ISD::SUB, SDLoc(N), VT, N->getOperand(0),
06703                            ShiftedVal);
06704       }
06705     }
06706   }
06707   return SDValue();
06708 }
06709 
06710 static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
06711                                                          SelectionDAG &DAG) {
06712   // Take advantage of vector comparisons producing 0 or -1 in each lane to
06713   // optimize away the operation when it's from a constant.
06714   //
06715   // The general transformation is:
06716   //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
06717   //       AND(VECTOR_CMP(x,y), constant2)
06718   //    constant2 = UNARYOP(constant)
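  // Illustrative example (not from the original source), with the unary op
  // being sint_to_fp on v4i32/v4f32:
  //   (sint_to_fp (and (setcc a, b), <1,1,1,1>))
  //     --> (bitcast (and (setcc a, b), (bitcast <1.0,1.0,1.0,1.0>)))
  // Each lane of the setcc is all-ones or all-zeros, so the AND selects either
  // the converted constant or 0.0 in that lane.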
06719 
06720   // Early exit if this isn't a vector operation, the operand of the
06721   // unary operation isn't a bitwise AND, or if the sizes of the operations
06722   // aren't the same.
06723   EVT VT = N->getValueType(0);
06724   if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
06725       N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
06726       VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
06727     return SDValue();
06728 
06729   // Now check that the other operand of the AND is a constant. We could
06730   // make the transformation for non-constant splats as well, but it's unclear
06731   // that would be a benefit as it would not eliminate any operations, just
06732   // perform one more step in scalar code before moving to the vector unit.
06733   if (BuildVectorSDNode *BV =
06734           dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
06735     // Bail out if the vector isn't a constant.
06736     if (!BV->isConstant())
06737       return SDValue();
06738 
06739     // Everything checks out. Build up the new and improved node.
06740     SDLoc DL(N);
06741     EVT IntVT = BV->getValueType(0);
06742     // Create a new constant of the appropriate type for the transformed
06743     // DAG.
06744     SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
06745     // The AND node needs bitcasts to/from an integer vector type around it.
06746     SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
06747     SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
06748                                  N->getOperand(0)->getOperand(0), MaskConst);
06749     SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
06750     return Res;
06751   }
06752 
06753   return SDValue();
06754 }
06755 
06756 static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG) {
06757   // First try to optimize away the conversion when it's conditionally from
06758   // a constant. Vectors only.
06759   SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG);
06760   if (Res != SDValue())
06761     return Res;
06762 
06763   EVT VT = N->getValueType(0);
06764   if (VT != MVT::f32 && VT != MVT::f64)
06765     return SDValue();
06766 
06767   // Only optimize when the source and destination types have the same width.
06768   if (VT.getSizeInBits() != N->getOperand(0).getValueType().getSizeInBits())
06769     return SDValue();
06770 
06771   // If the result of an integer load is only used by an integer-to-float
06772   // conversion, use an fp load and an AdvSIMD scalar {S|U}CVTF instead.
06773   // This eliminates an "integer-to-vector-move" UOP and improves throughput.
06774   SDValue N0 = N->getOperand(0);
06775   if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
06776       // Do not change the width of a volatile load.
06777       !cast<LoadSDNode>(N0)->isVolatile()) {
06778     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
06779     SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
06780                                LN0->getPointerInfo(), LN0->isVolatile(),
06781                                LN0->isNonTemporal(), LN0->isInvariant(),
06782                                LN0->getAlignment());
06783 
06784     // Make sure successors of the original load stay after it by updating them
06785     // to use the new Chain.
06786     DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
06787 
06788     unsigned Opcode =
06789         (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
06790     return DAG.getNode(Opcode, SDLoc(N), VT, Load);
06791   }
06792 
06793   return SDValue();
06794 }
06795 
06796 /// An EXTR instruction is made up of two shifts, ORed together. This helper
06797 /// searches for and classifies those shifts.
06798 static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
06799                          bool &FromHi) {
06800   if (N.getOpcode() == ISD::SHL)
06801     FromHi = false;
06802   else if (N.getOpcode() == ISD::SRL)
06803     FromHi = true;
06804   else
06805     return false;
06806 
06807   if (!isa<ConstantSDNode>(N.getOperand(1)))
06808     return false;
06809 
06810   ShiftAmount = N->getConstantOperandVal(1);
06811   Src = N->getOperand(0);
06812   return true;
06813 }
06814 
06815 /// EXTR instruction extracts a contiguous chunk of bits from two existing
06816 /// registers viewed as a high/low pair. This function looks for the pattern:
06817 /// (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) and replaces it with an
06818 /// EXTR. Can't quite be done in TableGen because the two immediates aren't
06819 /// independent.
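/// For example (illustrative, not from the original source), for i32:
///   (or (shl %a, 12), (srl %b, 20))  -->  EXTR w_d, w_a, w_b, #20
/// i.e. a 32-bit value built from the low 20 bits of %a (shifted to the top)
/// and the high 12 bits of %b (at the bottom).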
06820 static SDValue tryCombineToEXTR(SDNode *N,
06821                                 TargetLowering::DAGCombinerInfo &DCI) {
06822   SelectionDAG &DAG = DCI.DAG;
06823   SDLoc DL(N);
06824   EVT VT = N->getValueType(0);
06825 
06826   assert(N->getOpcode() == ISD::OR && "Unexpected root");
06827 
06828   if (VT != MVT::i32 && VT != MVT::i64)
06829     return SDValue();
06830 
06831   SDValue LHS;
06832   uint32_t ShiftLHS = 0;
06833   bool LHSFromHi = false;
06834   if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi))
06835     return SDValue();
06836 
06837   SDValue RHS;
06838   uint32_t ShiftRHS = 0;
06839   bool RHSFromHi = false;
06840   if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi))
06841     return SDValue();
06842 
06843   // If they're both trying to come from the high part of the register, they're
06844   // not really an EXTR.
06845   if (LHSFromHi == RHSFromHi)
06846     return SDValue();
06847 
06848   if (ShiftLHS + ShiftRHS != VT.getSizeInBits())
06849     return SDValue();
06850 
06851   if (LHSFromHi) {
06852     std::swap(LHS, RHS);
06853     std::swap(ShiftLHS, ShiftRHS);
06854   }
06855 
06856   return DAG.getNode(AArch64ISD::EXTR, DL, VT, LHS, RHS,
06857                      DAG.getConstant(ShiftRHS, MVT::i64));
06858 }
06859 
06860 static SDValue tryCombineToBSL(SDNode *N,
06861                                 TargetLowering::DAGCombinerInfo &DCI) {
06862   EVT VT = N->getValueType(0);
06863   SelectionDAG &DAG = DCI.DAG;
06864   SDLoc DL(N);
06865 
06866   if (!VT.isVector())
06867     return SDValue();
06868 
06869   SDValue N0 = N->getOperand(0);
06870   if (N0.getOpcode() != ISD::AND)
06871     return SDValue();
06872 
06873   SDValue N1 = N->getOperand(1);
06874   if (N1.getOpcode() != ISD::AND)
06875     return SDValue();
06876 
06877   // We only have to look for constant vectors here since the general, variable
06878   // case can be handled in TableGen.
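  // Illustrative per-lane pattern being matched (not from the original
  // source): with a constant lane mask C,
  //   (or (and LHS, C), (and RHS, ~C))  -->  (BSL C, LHS, RHS)
  // The loop below checks that the two build_vector constants are bitwise
  // complements of each other within the lane width.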
06879   unsigned Bits = VT.getVectorElementType().getSizeInBits();
06880   uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
06881   for (int i = 1; i >= 0; --i)
06882     for (int j = 1; j >= 0; --j) {
06883       BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
06884       BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
06885       if (!BVN0 || !BVN1)
06886         continue;
06887 
06888       bool FoundMatch = true;
06889       for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
06890         ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
06891         ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
06892         if (!CN0 || !CN1 ||
06893             CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
06894           FoundMatch = false;
06895           break;
06896         }
06897       }
06898 
06899       if (FoundMatch)
06900         return DAG.getNode(AArch64ISD::BSL, DL, VT, SDValue(BVN0, 0),
06901                            N0->getOperand(1 - i), N1->getOperand(1 - j));
06902     }
06903 
06904   return SDValue();
06905 }
06906 
06907 static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
06908                                 const AArch64Subtarget *Subtarget) {
06909   // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N))
06910   if (!EnableAArch64ExtrGeneration)
06911     return SDValue();
06912   SelectionDAG &DAG = DCI.DAG;
06913   EVT VT = N->getValueType(0);
06914 
06915   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
06916     return SDValue();
06917 
06918   SDValue Res = tryCombineToEXTR(N, DCI);
06919   if (Res.getNode())
06920     return Res;
06921 
06922   Res = tryCombineToBSL(N, DCI);
06923   if (Res.getNode())
06924     return Res;
06925 
06926   return SDValue();
06927 }
06928 
06929 static SDValue performBitcastCombine(SDNode *N,
06930                                      TargetLowering::DAGCombinerInfo &DCI,
06931                                      SelectionDAG &DAG) {
06932   // Wait 'til after everything is legalized to try this. That way we have
06933   // legal vector types and such.
06934   if (DCI.isBeforeLegalizeOps())
06935     return SDValue();
06936 
06937   // Remove extraneous bitcasts around an extract_subvector.
06938   // For example,
06939   //    (v4i16 (bitconvert
06940   //             (extract_subvector (v2i64 (bitconvert (v8i16 ...)), (i64 1)))))
06941   //  becomes
06942   //    (extract_subvector ((v8i16 ...), (i64 4)))
06943 
06944   // Only interested in 64-bit vectors as the ultimate result.
06945   EVT VT = N->getValueType(0);
06946   if (!VT.isVector())
06947     return SDValue();
06948   if (VT.getSimpleVT().getSizeInBits() != 64)
06949     return SDValue();
06950   // Is the operand an extract_subvector starting at the beginning or halfway
06951   // point of the vector? A low half may also come through as an
06952   // EXTRACT_SUBREG, so look for that, too.
06953   SDValue Op0 = N->getOperand(0);
06954   if (Op0->getOpcode() != ISD::EXTRACT_SUBVECTOR &&
06955       !(Op0->isMachineOpcode() &&
06956         Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG))
06957     return SDValue();
06958   uint64_t idx = cast<ConstantSDNode>(Op0->getOperand(1))->getZExtValue();
06959   if (Op0->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
06960     if (Op0->getValueType(0).getVectorNumElements() != idx && idx != 0)
06961       return SDValue();
06962   } else if (Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG) {
06963     if (idx != AArch64::dsub)
06964       return SDValue();
06965     // The dsub reference is equivalent to a lane zero subvector reference.
06966     idx = 0;
06967   }
06968   // Look through the bitcast of the input to the extract.
06969   if (Op0->getOperand(0)->getOpcode() != ISD::BITCAST)
06970     return SDValue();
06971   SDValue Source = Op0->getOperand(0)->getOperand(0);
06972   // If the source type has twice the number of elements as our destination
06973   // type, we know this is an extract of the high or low half of the vector.
06974   EVT SVT = Source->getValueType(0);
06975   if (SVT.getVectorNumElements() != VT.getVectorNumElements() * 2)
06976     return SDValue();
06977 
06978   DEBUG(dbgs() << "aarch64-lower: bitcast extract_subvector simplification\n");
06979 
06980   // Create the simplified form to just extract the low or high half of the
06981   // vector directly rather than bothering with the bitcasts.
06982   SDLoc dl(N);
06983   unsigned NumElements = VT.getVectorNumElements();
06984   if (idx) {
06985     SDValue HalfIdx = DAG.getConstant(NumElements, MVT::i64);
06986     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Source, HalfIdx);
06987   } else {
06988     SDValue SubReg = DAG.getTargetConstant(AArch64::dsub, MVT::i32);
06989     return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, VT,
06990                                       Source, SubReg),
06991                    0);
06992   }
06993 }
06994 
06995 static SDValue performConcatVectorsCombine(SDNode *N,
06996                                            TargetLowering::DAGCombinerInfo &DCI,
06997                                            SelectionDAG &DAG) {
06998   // Wait 'til after everything is legalized to try this. That way we have
06999   // legal vector types and such.
07000   if (DCI.isBeforeLegalizeOps())
07001     return SDValue();
07002 
07003   SDLoc dl(N);
07004   EVT VT = N->getValueType(0);
07005 
07006   // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
07007   // splat. The indexed instructions are going to be expecting a DUPLANE64, so
07008   // canonicalise to that.
07009   if (N->getOperand(0) == N->getOperand(1) && VT.getVectorNumElements() == 2) {
07010     assert(VT.getVectorElementType().getSizeInBits() == 64);
07011     return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT,
07012                        WidenVector(N->getOperand(0), DAG),
07013                        DAG.getConstant(0, MVT::i64));
07014   }
07015 
07016   // Canonicalise concat_vectors so that the right-hand vector has as few
07017   // bit-casts as possible before its real operation. The primary matching
07018   // destination for these operations will be the narrowing "2" instructions,
07019   // which depend on the operation being performed on this right-hand vector.
07020   // For example,
07021   //    (concat_vectors LHS,  (v1i64 (bitconvert (v4i16 RHS))))
07022   // becomes
07023   //    (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
07024 
07025   SDValue Op1 = N->getOperand(1);
07026   if (Op1->getOpcode() != ISD::BITCAST)
07027     return SDValue();
07028   SDValue RHS = Op1->getOperand(0);
07029   MVT RHSTy = RHS.getValueType().getSimpleVT();
07030   // If the RHS is not a vector, this is not the pattern we're looking for.
07031   if (!RHSTy.isVector())
07032     return SDValue();
07033 
07034   DEBUG(dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
07035 
07036   MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
07037                                   RHSTy.getVectorNumElements() * 2);
07038   return DAG.getNode(
07039       ISD::BITCAST, dl, VT,
07040       DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
07041                   DAG.getNode(ISD::BITCAST, dl, RHSTy, N->getOperand(0)), RHS));
07042 }
07043 
07044 static SDValue tryCombineFixedPointConvert(SDNode *N,
07045                                            TargetLowering::DAGCombinerInfo &DCI,
07046                                            SelectionDAG &DAG) {
07047   // Wait 'til after everything is legalized to try this. That way we have
07048   // legal vector types and such.
07049   if (DCI.isBeforeLegalizeOps())
07050     return SDValue();
07051   // Transform a scalar conversion of a value from a lane extract into a
07052   // lane extract of a vector conversion. E.g., from foo1 to foo2:
07053   // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
07054   // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
07055   //
07056   // The second form interacts better with instruction selection and the
07057   // register allocator to avoid cross-class register copies that aren't
07058   // coalescable due to a lane reference.
07059 
07060   // Check the operand and see if it originates from a lane extract.
07061   SDValue Op1 = N->getOperand(1);
07062   if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
07063     // Yep, no additional predication needed. Perform the transform.
07064     SDValue IID = N->getOperand(0);
07065     SDValue Shift = N->getOperand(2);
07066     SDValue Vec = Op1.getOperand(0);
07067     SDValue Lane = Op1.getOperand(1);
07068     EVT ResTy = N->getValueType(0);
07069     EVT VecResTy;
07070     SDLoc DL(N);
07071 
07072     // The vector width should be 128 bits by the time we get here, even
07073     // if it started as 64 bits (the extract_vector handling will have
07074     // done so).
07075     assert(Vec.getValueType().getSizeInBits() == 128 &&
07076            "unexpected vector size on extract_vector_elt!");
07077     if (Vec.getValueType() == MVT::v4i32)
07078       VecResTy = MVT::v4f32;
07079     else if (Vec.getValueType() == MVT::v2i64)
07080       VecResTy = MVT::v2f64;
07081     else
07082       llvm_unreachable("unexpected vector type!");
07083 
07084     SDValue Convert =
07085         DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
07086     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
07087   }
07088   return SDValue();
07089 }
07090 
07091 // AArch64 high-vector "long" operations are formed by performing the non-high
07092 // version on an extract_subvector of each operand which gets the high half:
07093 //
07094 //  (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
07095 //
07096 // However, there are cases which don't have an extract_high explicitly, but
07097 // have another operation that can be made compatible with one for free. For
07098 // example:
07099 //
07100 //  (dupv64 scalar) --> (extract_high (dup128 scalar))
07101 //
07102 // This routine does the actual conversion of such DUPs, once outer routines
07103 // have determined that everything else is in order.
07104 static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
07105   // We can handle most types of duplicate, but the lane ones have an extra
07106   // operand saying *which* lane, so we need to know.
07107   bool IsDUPLANE;
07108   switch (N.getOpcode()) {
07109   case AArch64ISD::DUP:
07110     IsDUPLANE = false;
07111     break;
07112   case AArch64ISD::DUPLANE8:
07113   case AArch64ISD::DUPLANE16:
07114   case AArch64ISD::DUPLANE32:
07115   case AArch64ISD::DUPLANE64:
07116     IsDUPLANE = true;
07117     break;
07118   default:
07119     return SDValue();
07120   }
07121 
07122   MVT NarrowTy = N.getSimpleValueType();
07123   if (!NarrowTy.is64BitVector())
07124     return SDValue();
07125 
07126   MVT ElementTy = NarrowTy.getVectorElementType();
07127   unsigned NumElems = NarrowTy.getVectorNumElements();
07128   MVT NewDUPVT = MVT::getVectorVT(ElementTy, NumElems * 2);
07129 
07130   SDValue NewDUP;
07131   if (IsDUPLANE)
07132     NewDUP = DAG.getNode(N.getOpcode(), SDLoc(N), NewDUPVT, N.getOperand(0),
07133                          N.getOperand(1));
07134   else
07135     NewDUP = DAG.getNode(AArch64ISD::DUP, SDLoc(N), NewDUPVT, N.getOperand(0));
07136 
07137   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N.getNode()), NarrowTy,
07138                      NewDUP, DAG.getConstant(NumElems, MVT::i64));
07139 }
07140 
07141 static bool isEssentiallyExtractSubvector(SDValue N) {
07142   if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR)
07143     return true;
07144 
07145   return N.getOpcode() == ISD::BITCAST &&
07146          N.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR;
07147 }
07148 
07149 /// \brief Helper structure to keep track of ISD::SET_CC operands.
07150 struct GenericSetCCInfo {
07151   const SDValue *Opnd0;
07152   const SDValue *Opnd1;
07153   ISD::CondCode CC;
07154 };
07155 
07156 /// \brief Helper structure to keep track of a SET_CC lowered into AArch64 code.
07157 struct AArch64SetCCInfo {
07158   const SDValue *Cmp;
07159   AArch64CC::CondCode CC;
07160 };
07161 
07162 /// \brief Helper structure to keep track of SetCC information.
07163 union SetCCInfo {
07164   GenericSetCCInfo Generic;
07165   AArch64SetCCInfo AArch64;
07166 };
07167 
07168 /// \brief Helper structure to be able to read SetCC information.  If the
07169 /// IsAArch64 field is set to true, Info is an AArch64SetCCInfo, otherwise
07170 /// Info is a GenericSetCCInfo.
07171 struct SetCCInfoAndKind {
07172   SetCCInfo Info;
07173   bool IsAArch64;
07174 };
07175 
07176 /// \brief Check whether or not \p Op is a SET_CC operation, either a generic or
07177 /// an
07178 /// AArch64 lowered one.
07179 /// \p SetCCInfo is filled accordingly.
07180 /// \post SetCCInfo is meaningful only when this function returns true.
07181 /// \return True when Op is a kind of SET_CC operation.
07182 static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
07183   // If this is a setcc, this is straight forward.
07184   if (Op.getOpcode() == ISD::SETCC) {
07185     SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
07186     SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
07187     SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
07188     SetCCInfo.IsAArch64 = false;
07189     return true;
07190   }
07191   // Otherwise, check if this is a matching csel instruction.
07192   // In other words:
07193   // - csel 1, 0, cc
07194   // - csel 0, 1, !cc
07195   if (Op.getOpcode() != AArch64ISD::CSEL)
07196     return false;
07197   // Set the information about the operands.
07198   // TODO: we want the operands of the Cmp not the csel
07199   SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
07200   SetCCInfo.IsAArch64 = true;
07201   SetCCInfo.Info.AArch64.CC = static_cast<AArch64CC::CondCode>(
07202       cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
07203 
07204   // Check that the operands matches the constraints:
07205   // (1) Both operands must be constants.
07206   // (2) One must be 1 and the other must be 0.
07207   ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
07208   ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
07209 
07210   // Check (1).
07211   if (!TValue || !FValue)
07212     return false;
07213 
07214   // Check (2).
07215   if (!TValue->isOne()) {
07216     // Update the comparison when we are interested in !cc.
07217     std::swap(TValue, FValue);
07218     SetCCInfo.Info.AArch64.CC =
07219         AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
07220   }
07221   return TValue->isOne() && FValue->isNullValue();
07222 }
07223 
07224 // Returns true if Op is setcc or zext of setcc.
07225 static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
07226   if (isSetCC(Op, Info))
07227     return true;
07228   return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
07229     isSetCC(Op->getOperand(0), Info));
07230 }
07231 
07232 // The folding we want to perform is:
07233 // (add x, [zext] (setcc cc ...) )
07234 //   -->
07235 // (csel x, (add x, 1), !cc ...)
07236 //
07237 // The latter will get matched to a CSINC instruction.
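// Illustrative example (not from the original source), for i32 values:
//   (add w0, (setcc eq w1, w2))
// becomes
//   cmp  w1, w2
//   cinc w0, w0, eq        ; i.e. csinc w0, w0, w0, ne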
07238 static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
07239   assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
07240   SDValue LHS = Op->getOperand(0);
07241   SDValue RHS = Op->getOperand(1);
07242   SetCCInfoAndKind InfoAndKind;
07243 
07244   // If neither operand is a SET_CC, give up.
07245   if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
07246     std::swap(LHS, RHS);
07247     if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
07248       return SDValue();
07249   }
07250 
07251   // FIXME: This could be generalized to work for FP comparisons.
07252   EVT CmpVT = InfoAndKind.IsAArch64
07253                   ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
07254                   : InfoAndKind.Info.Generic.Opnd0->getValueType();
07255   if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
07256     return SDValue();
07257 
07258   SDValue CCVal;
07259   SDValue Cmp;
07260   SDLoc dl(Op);
07261   if (InfoAndKind.IsAArch64) {
07262     CCVal = DAG.getConstant(
07263         AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), MVT::i32);
07264     Cmp = *InfoAndKind.Info.AArch64.Cmp;
07265   } else
07266     Cmp = getAArch64Cmp(*InfoAndKind.Info.Generic.Opnd0,
07267                       *InfoAndKind.Info.Generic.Opnd1,
07268                       ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, true),
07269                       CCVal, DAG, dl);
07270 
07271   EVT VT = Op->getValueType(0);
07272   LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, VT));
07273   return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
07274 }
07275 
07276 // The basic add/sub long vector instructions have variants with "2" on the end
07277 // which act on the high-half of their inputs. They are normally matched by
07278 // patterns like:
07279 //
07280 // (add (zeroext (extract_high LHS)),
07281 //      (zeroext (extract_high RHS)))
07282 // -> uaddl2 vD, vN, vM
07283 //
07284 // However, if one of the extracts is something like a duplicate, this
07285 // instruction can still be used profitably. This function puts the DAG into a
07286 // more appropriate form for those patterns to trigger.
07287 static SDValue performAddSubLongCombine(SDNode *N,
07288                                         TargetLowering::DAGCombinerInfo &DCI,
07289                                         SelectionDAG &DAG) {
07290   if (DCI.isBeforeLegalizeOps())
07291     return SDValue();
07292 
07293   MVT VT = N->getSimpleValueType(0);
07294   if (!VT.is128BitVector()) {
07295     if (N->getOpcode() == ISD::ADD)
07296       return performSetccAddFolding(N, DAG);
07297     return SDValue();
07298   }
07299 
07300   // Make sure both branches are extended in the same way.
07301   SDValue LHS = N->getOperand(0);
07302   SDValue RHS = N->getOperand(1);
07303   if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
07304        LHS.getOpcode() != ISD::SIGN_EXTEND) ||
07305       LHS.getOpcode() != RHS.getOpcode())
07306     return SDValue();
07307 
07308   unsigned ExtType = LHS.getOpcode();
07309 
07310   // It's not worth doing if at least one of the inputs isn't already an
07311   // extract, but we don't know which it'll be so we have to try both.
07312   if (isEssentiallyExtractSubvector(LHS.getOperand(0))) {
07313     RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
07314     if (!RHS.getNode())
07315       return SDValue();
07316 
07317     RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
07318   } else if (isEssentiallyExtractSubvector(RHS.getOperand(0))) {
07319     LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
07320     if (!LHS.getNode())
07321       return SDValue();
07322 
07323     LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
07324   }
07325 
07326   return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
07327 }
07328 
07329 // Massage DAGs which we can use the high-half "long" operations on into
07330 // something isel will recognize better. E.g.
07331 //
07332 // (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
07333 //   (aarch64_neon_umull (extract_high (v2i64 vec)))
07334 //                     (extract_high (v2i64 (dup128 scalar)))))
07335 //
07336 static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
07337                                        TargetLowering::DAGCombinerInfo &DCI,
07338                                        SelectionDAG &DAG) {
07339   if (DCI.isBeforeLegalizeOps())
07340     return SDValue();
07341 
07342   SDValue LHS = N->getOperand(1);
07343   SDValue RHS = N->getOperand(2);
07344   assert(LHS.getValueType().is64BitVector() &&
07345          RHS.getValueType().is64BitVector() &&
07346          "unexpected shape for long operation");
07347 
07348   // Either node could be a DUP, but it's not worth doing both of them (you'd
07349   // just as well use the non-high version) so look for a corresponding extract
07350   // operation on the other "wing".
07351   if (isEssentiallyExtractSubvector(LHS)) {
07352     RHS = tryExtendDUPToExtractHigh(RHS, DAG);
07353     if (!RHS.getNode())
07354       return SDValue();
07355   } else if (isEssentiallyExtractSubvector(RHS)) {
07356     LHS = tryExtendDUPToExtractHigh(LHS, DAG);
07357     if (!LHS.getNode())
07358       return SDValue();
07359   }
07360 
07361   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
07362                      N->getOperand(0), LHS, RHS);
07363 }
07364 
07365 static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
07366   MVT ElemTy = N->getSimpleValueType(0).getScalarType();
07367   unsigned ElemBits = ElemTy.getSizeInBits();
07368 
07369   int64_t ShiftAmount;
07370   if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
07371     APInt SplatValue, SplatUndef;
07372     unsigned SplatBitSize;
07373     bool HasAnyUndefs;
07374     if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
07375                               HasAnyUndefs, ElemBits) ||
07376         SplatBitSize != ElemBits)
07377       return SDValue();
07378 
07379     ShiftAmount = SplatValue.getSExtValue();
07380   } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
07381     ShiftAmount = CVN->getSExtValue();
07382   } else
07383     return SDValue();
07384 
07385   unsigned Opcode;
07386   bool IsRightShift;
07387   switch (IID) {
07388   default:
07389     llvm_unreachable("Unknown shift intrinsic");
07390   case Intrinsic::aarch64_neon_sqshl:
07391     Opcode = AArch64ISD::SQSHL_I;
07392     IsRightShift = false;
07393     break;
07394   case Intrinsic::aarch64_neon_uqshl:
07395     Opcode = AArch64ISD::UQSHL_I;
07396     IsRightShift = false;
07397     break;
07398   case Intrinsic::aarch64_neon_srshl:
07399     Opcode = AArch64ISD::SRSHR_I;
07400     IsRightShift = true;
07401     break;
07402   case Intrinsic::aarch64_neon_urshl:
07403     Opcode = AArch64ISD::URSHR_I;
07404     IsRightShift = true;
07405     break;
07406   case Intrinsic::aarch64_neon_sqshlu:
07407     Opcode = AArch64ISD::SQSHLU_I;
07408     IsRightShift = false;
07409     break;
07410   }
07411 
07412   if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits)
07413     return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), N->getOperand(1),
07414                        DAG.getConstant(-ShiftAmount, MVT::i32));
07415   else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits)
07416     return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), N->getOperand(1),
07417                        DAG.getConstant(ShiftAmount, MVT::i32));
07418 
07419   return SDValue();
07420 }
07421 
07422 // The CRC32[BH] instructions ignore the high bits of their data operand. Since
07423 // the intrinsics must be legal and take an i32, this means there's almost
07424 // certainly going to be a zext in the DAG which we can eliminate.
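// Illustrative example (not from the original source):
//   (aarch64_crc32b %crc, (and %data, 255))
// becomes
//   (aarch64_crc32b %crc, %data)
// because CRC32B only reads the low 8 bits of its data operand anyway.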
07425 static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
07426   SDValue AndN = N->getOperand(2);
07427   if (AndN.getOpcode() != ISD::AND)
07428     return SDValue();
07429 
07430   ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1));
07431   if (!CMask || CMask->getZExtValue() != Mask)
07432     return SDValue();
07433 
07434   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
07435                      N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
07436 }
07437 
07438 static SDValue performIntrinsicCombine(SDNode *N,
07439                                        TargetLowering::DAGCombinerInfo &DCI,
07440                                        const AArch64Subtarget *Subtarget) {
07441   SelectionDAG &DAG = DCI.DAG;
07442   unsigned IID = getIntrinsicID(N);
07443   switch (IID) {
07444   default:
07445     break;
07446   case Intrinsic::aarch64_neon_vcvtfxs2fp:
07447   case Intrinsic::aarch64_neon_vcvtfxu2fp:
07448     return tryCombineFixedPointConvert(N, DCI, DAG);
07449     break;
07450   case Intrinsic::aarch64_neon_fmax:
07451     return DAG.getNode(AArch64ISD::FMAX, SDLoc(N), N->getValueType(0),
07452                        N->getOperand(1), N->getOperand(2));
07453   case Intrinsic::aarch64_neon_fmin:
07454     return DAG.getNode(AArch64ISD::FMIN, SDLoc(N), N->getValueType(0),
07455                        N->getOperand(1), N->getOperand(2));
07456   case Intrinsic::aarch64_neon_smull:
07457   case Intrinsic::aarch64_neon_umull:
07458   case Intrinsic::aarch64_neon_pmull:
07459   case Intrinsic::aarch64_neon_sqdmull:
07460     return tryCombineLongOpWithDup(IID, N, DCI, DAG);
07461   case Intrinsic::aarch64_neon_sqshl:
07462   case Intrinsic::aarch64_neon_uqshl:
07463   case Intrinsic::aarch64_neon_sqshlu:
07464   case Intrinsic::aarch64_neon_srshl:
07465   case Intrinsic::aarch64_neon_urshl:
07466     return tryCombineShiftImm(IID, N, DAG);
07467   case Intrinsic::aarch64_crc32b:
07468   case Intrinsic::aarch64_crc32cb:
07469     return tryCombineCRC32(0xff, N, DAG);
07470   case Intrinsic::aarch64_crc32h:
07471   case Intrinsic::aarch64_crc32ch:
07472     return tryCombineCRC32(0xffff, N, DAG);
07473   }
07474   return SDValue();
07475 }
07476 
07477 static SDValue performExtendCombine(SDNode *N,
07478                                     TargetLowering::DAGCombinerInfo &DCI,
07479                                     SelectionDAG &DAG) {
07480   // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
07481   // we can convert that DUP into another extract_high (of a bigger DUP), which
07482   // helps the backend to decide that an sabdl2 would be useful, saving a real
07483   // extract_high operation.
07484   if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
07485       N->getOperand(0).getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
07486     SDNode *ABDNode = N->getOperand(0).getNode();
07487     unsigned IID = getIntrinsicID(ABDNode);
07488     if (IID == Intrinsic::aarch64_neon_sabd ||
07489         IID == Intrinsic::aarch64_neon_uabd) {
07490       SDValue NewABD = tryCombineLongOpWithDup(IID, ABDNode, DCI, DAG);
07491       if (!NewABD.getNode())
07492         return SDValue();
07493 
07494       return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0),
07495                          NewABD);
07496     }
07497   }
07498 
07499   // This is effectively a custom type legalization for AArch64.
07500   //
07501   // Type legalization will split an extend of a small, legal, type to a larger
07502   // illegal type by first splitting the destination type, often creating
07503   // illegal source types, which then get legalized in isel-confusing ways,
07504   // leading to really terrible codegen. E.g.,
07505   //   %result = v8i32 sext v8i8 %value
07506   // becomes
07507   //   %losrc = extract_subreg %value, ...
07508   //   %hisrc = extract_subreg %value, ...
07509   //   %lo = v4i32 sext v4i8 %losrc
07510   //   %hi = v4i32 sext v4i8 %hisrc
07511   // Things go rapidly downhill from there.
07512   //
07513   // For AArch64, the [sz]ext vector instructions can only go up one element
07514   // size, so we can, e.g., extend from i8 to i16, but going from i8 to i32
07515   // takes two instructions.
07516   //
07517   // This implies that the most efficient way to do the extend from v8i8
07518   // to two v4i32 values is to first extend the v8i8 to v8i16, then allow
07519   // the normal splitting to happen for the v8i16->v8i32.
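        //
        // A sketch of the intended result (assuming a v8i8 -> v8i32 sext):
        //   %ext = v8i16 sext v8i8 %value
        //   %lo  = v4i32 sext (v4i16 extract_subvector %ext, 0)
        //   %hi  = v4i32 sext (v4i16 extract_subvector %ext, 4)
        //   %result = v8i32 concat_vectors %lo, %hi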
07520 
07521   // This is pre-legalization to catch some cases where the default
07522   // type legalization will create ill-tempered code.
07523   if (!DCI.isBeforeLegalizeOps())
07524     return SDValue();
07525 
07526   // We're only interested in cleaning things up for non-legal vector types
07527   // here. If both the source and destination are legal, things will just
07528   // work naturally without any fiddling.
07529   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
07530   EVT ResVT = N->getValueType(0);
07531   if (!ResVT.isVector() || TLI.isTypeLegal(ResVT))
07532     return SDValue();
07533   // If the vector type isn't a simple VT, it's beyond the scope of what
07534   // we're worried about here. Let legalization do its thing and hope for
07535   // the best.
07536   SDValue Src = N->getOperand(0);
07537   EVT SrcVT = Src->getValueType(0);
07538   if (!ResVT.isSimple() || !SrcVT.isSimple())
07539     return SDValue();
07540 
07541   // If the source VT is a 64-bit vector, we can play games and get the
07542   // better results we want.
07543   if (SrcVT.getSizeInBits() != 64)
07544     return SDValue();
07545 
07546   unsigned SrcEltSize = SrcVT.getVectorElementType().getSizeInBits();
07547   unsigned ElementCount = SrcVT.getVectorNumElements();
07548   SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), ElementCount);
07549   SDLoc DL(N);
07550   Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src);
07551 
07552   // Now split the rest of the operation into two halves, each with a 64
07553   // bit source.
07554   EVT LoVT, HiVT;
07555   SDValue Lo, Hi;
07556   unsigned NumElements = ResVT.getVectorNumElements();
07557   assert(!(NumElements & 1) && "Splitting vector, but not in half!");
07558   LoVT = HiVT = EVT::getVectorVT(*DAG.getContext(),
07559                                  ResVT.getVectorElementType(), NumElements / 2);
07560 
07561   EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(),
07562                                LoVT.getVectorNumElements());
07563   Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
07564                    DAG.getIntPtrConstant(0));
07565   Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
07566                    DAG.getIntPtrConstant(InNVT.getVectorNumElements()));
07567   Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo);
07568   Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi);
07569 
07570   // Now combine the parts back together so we still have a single result
07571   // like the combiner expects.
07572   return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi);
07573 }
07574 
07575 /// Replace a splat of a scalar to a vector store by scalar stores of the scalar
07576 /// value. The load store optimizer pass will merge them to store pair stores.
07577 /// This has better performance than a splat of the scalar followed by a split
07578 /// vector store. Even if the stores are not merged it is four stores vs. a dup
07579 /// followed by an ext.b and two stores.
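      /// For example (a sketch): a store of a v4i32 splat of w1 to [x0] becomes
      /// four scalar "str w1" stores at offsets 0, 4, 8 and 12, which the
      /// load/store optimizer can then merge into two stp instructions.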
07580 static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) {
07581   SDValue StVal = St->getValue();
07582   EVT VT = StVal.getValueType();
07583 
07584   // Don't replace floating point stores, they possibly won't be transformed to
07585   // stp because of the store pair suppress pass.
07586   if (VT.isFloatingPoint())
07587     return SDValue();
07588 
07589   // Check for insert vector elements.
07590   if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
07591     return SDValue();
07592 
07593   // We can express a splat as store pair(s) for 2 or 4 elements.
07594   unsigned NumVecElts = VT.getVectorNumElements();
07595   if (NumVecElts != 4 && NumVecElts != 2)
07596     return SDValue();
07597   SDValue SplatVal = StVal.getOperand(1);
07598   unsigned RemainInsertElts = NumVecElts - 1;
07599 
07600   // Check that this is a splat.
07601   while (--RemainInsertElts) {
07602     SDValue NextInsertElt = StVal.getOperand(0);
07603     if (NextInsertElt.getOpcode() != ISD::INSERT_VECTOR_ELT)
07604       return SDValue();
07605     if (NextInsertElt.getOperand(1) != SplatVal)
07606       return SDValue();
07607     StVal = NextInsertElt;
07608   }
07609   unsigned OrigAlignment = St->getAlignment();
07610   unsigned EltOffset = NumVecElts == 4 ? 4 : 8;
07611   unsigned Alignment = std::min(OrigAlignment, EltOffset);
07612 
07613   // Create scalar stores. This is at least as good as the code sequence for a
07614   // split unaligned store which is a dup.s, ext.b, and two stores.
07615   // Most of the time the three stores should be replaced by store pair
07616   // instructions (stp).
07617   SDLoc DL(St);
07618   SDValue BasePtr = St->getBasePtr();
07619   SDValue NewST1 =
07620       DAG.getStore(St->getChain(), DL, SplatVal, BasePtr, St->getPointerInfo(),
07621                    St->isVolatile(), St->isNonTemporal(), St->getAlignment());
07622 
07623   unsigned Offset = EltOffset;
07624   while (--NumVecElts) {
07625     SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
07626                                     DAG.getConstant(Offset, MVT::i64));
07627     NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
07628                           St->getPointerInfo(), St->isVolatile(),
07629                           St->isNonTemporal(), Alignment);
07630     Offset += EltOffset;
07631   }
07632   return NewST1;
07633 }
07634 
07635 static SDValue performSTORECombine(SDNode *N,
07636                                    TargetLowering::DAGCombinerInfo &DCI,
07637                                    SelectionDAG &DAG,
07638                                    const AArch64Subtarget *Subtarget) {
07639   if (!DCI.isBeforeLegalize())
07640     return SDValue();
07641 
07642   StoreSDNode *S = cast<StoreSDNode>(N);
07643   if (S->isVolatile())
07644     return SDValue();
07645 
07646   // Cyclone has bad performance on unaligned 16B stores when crossing line and
07647   // page boundaries. We want to split such stores.
07648   if (!Subtarget->isCyclone())
07649     return SDValue();
07650 
07651   // Don't split when optimizing for minimum size (-Oz).
07652   MachineFunction &MF = DAG.getMachineFunction();
07653   bool IsMinSize = MF.getFunction()->getAttributes().hasAttribute(
07654       AttributeSet::FunctionIndex, Attribute::MinSize);
07655   if (IsMinSize)
07656     return SDValue();
07657 
07658   SDValue StVal = S->getValue();
07659   EVT VT = StVal.getValueType();
07660 
07661   // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
07662   // those up regresses performance on micro-benchmarks and olden/bh.
07663   if (!VT.isVector() || VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
07664     return SDValue();
07665 
07666   // Split unaligned 16B stores. They are terrible for performance.
07667   // Don't split stores with alignment of 1 or 2. Code that uses clang vector
07668   // extensions can use this to mark that it does not want splitting to happen
07669   // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
07670   // eliminating alignment hazards is only 1 in 8 for alignment of 2.
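        // As a sketch of the transformation below: a 16-byte vector store with
        // alignment 4 or 8 is rewritten as two 8-byte stores at [base] and
        // [base, #8], which lowers the chance of crossing a cache-line or page
        // boundary.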
07671   if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 ||
07672       S->getAlignment() <= 2)
07673     return SDValue();
07674 
07675   // If we get a splat of a scalar convert this vector store to a store of
07676   // scalars. They will be merged into store pairs thereby removing two
07677   // instructions.
07678   SDValue ReplacedSplat = replaceSplatVectorStore(DAG, S);
07679   if (ReplacedSplat != SDValue())
07680     return ReplacedSplat;
07681 
07682   SDLoc DL(S);
07683   unsigned NumElts = VT.getVectorNumElements() / 2;
07684   // Split VT into two.
07685   EVT HalfVT =
07686       EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts);
07687   SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
07688                                    DAG.getIntPtrConstant(0));
07689   SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
07690                                    DAG.getIntPtrConstant(NumElts));
07691   SDValue BasePtr = S->getBasePtr();
07692   SDValue NewST1 =
07693       DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
07694                    S->isVolatile(), S->isNonTemporal(), S->getAlignment());
07695   SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
07696                                   DAG.getConstant(8, MVT::i64));
07697   return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
07698                       S->getPointerInfo(), S->isVolatile(), S->isNonTemporal(),
07699                       S->getAlignment());
07700 }
07701 
07702 /// Target-specific DAG combine function for post-increment LD1 (lane) and
07703 /// post-increment LD1R.
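      /// A sketch of the pattern handled here (lane form): an insert_vector_elt
      /// whose scalar operand is a load from Addr, where Addr is separately
      /// incremented by the element size; the pair becomes one LD1LANEpost node
      /// (LD1DUPpost for the DUP form) that also produces the updated address.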
07704 static SDValue performPostLD1Combine(SDNode *N,
07705                                      TargetLowering::DAGCombinerInfo &DCI,
07706                                      bool IsLaneOp) {
07707   if (DCI.isBeforeLegalizeOps())
07708     return SDValue();
07709 
07710   SelectionDAG &DAG = DCI.DAG;
07711   EVT VT = N->getValueType(0);
07712 
07713   unsigned LoadIdx = IsLaneOp ? 1 : 0;
07714   SDNode *LD = N->getOperand(LoadIdx).getNode();
07715   // If it is not a LOAD, we cannot do this combine.
07716   if (LD->getOpcode() != ISD::LOAD)
07717     return SDValue();
07718 
07719   LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
07720   EVT MemVT = LoadSDN->getMemoryVT();
07721   // Check if memory operand is the same type as the vector element.
07722   if (MemVT != VT.getVectorElementType())
07723     return SDValue();
07724 
07725   // Check if there are other uses. If so, do not combine as it will introduce
07726   // an extra load.
07727   for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
07728        ++UI) {
07729     if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
07730       continue;
07731     if (*UI != N)
07732       return SDValue();
07733   }
07734 
07735   SDValue Addr = LD->getOperand(1);
07736   SDValue Vector = N->getOperand(0);
07737   // Search for a use of the address operand that is an increment.
07738   for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE =
07739        Addr.getNode()->use_end(); UI != UE; ++UI) {
07740     SDNode *User = *UI;
07741     if (User->getOpcode() != ISD::ADD
07742         || UI.getUse().getResNo() != Addr.getResNo())
07743       continue;
07744 
07745     // Check that the add is independent of the load.  Otherwise, folding it
07746     // would create a cycle.
07747     if (User->isPredecessorOf(LD) || LD->isPredecessorOf(User))
07748       continue;
07749     // Also check that the add is not used in the vector operand.  This would also
07750     // create a cycle.
07751     if (User->isPredecessorOf(Vector.getNode()))
07752       continue;
07753 
07754     // If the increment is a constant, it must match the memory ref size.
07755     SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
07756     if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
07757       uint32_t IncVal = CInc->getZExtValue();
07758       unsigned NumBytes = VT.getScalarSizeInBits() / 8;
07759       if (IncVal != NumBytes)
07760         continue;
07761       Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
07762     }
07763 
07764     SmallVector<SDValue, 8> Ops;
07765     Ops.push_back(LD->getOperand(0));  // Chain
07766     if (IsLaneOp) {
07767       Ops.push_back(Vector);           // The vector to be inserted
07768       Ops.push_back(N->getOperand(2)); // The lane to be inserted in the vector
07769     }
07770     Ops.push_back(Addr);
07771     Ops.push_back(Inc);
07772 
07773     EVT Tys[3] = { VT, MVT::i64, MVT::Other };
07774     SDVTList SDTys = DAG.getVTList(Tys);
07775     unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
07776     SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
07777                                            MemVT,
07778                                            LoadSDN->getMemOperand());
07779 
07780     // Update the uses.
07781     std::vector<SDValue> NewResults;
07782     NewResults.push_back(SDValue(LD, 0));             // The result of load
07783     NewResults.push_back(SDValue(UpdN.getNode(), 2)); // Chain
07784     DCI.CombineTo(LD, NewResults);
07785     DCI.CombineTo(N, SDValue(UpdN.getNode(), 0));     // Dup/Inserted Result
07786     DCI.CombineTo(User, SDValue(UpdN.getNode(), 1));  // Write back register
07787 
07788     break;
07789   }
07790   return SDValue();
07791 }
07792 
07793 /// Target-specific DAG combine function for NEON load/store intrinsics
07794 /// to merge base address updates.
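      /// For example (a sketch): an @llvm.aarch64.neon.ld2 from %addr followed by
      /// an add of %addr by the total number of bytes loaded is folded into a
      /// single LD2post node returning the loaded vectors plus the written-back
      /// base register.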
07795 static SDValue performNEONPostLDSTCombine(SDNode *N,
07796                                           TargetLowering::DAGCombinerInfo &DCI,
07797                                           SelectionDAG &DAG) {
07798   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
07799     return SDValue();
07800 
07801   unsigned AddrOpIdx = N->getNumOperands() - 1;
07802   SDValue Addr = N->getOperand(AddrOpIdx);
07803 
07804   // Search for a use of the address operand that is an increment.
07805   for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
07806        UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
07807     SDNode *User = *UI;
07808     if (User->getOpcode() != ISD::ADD ||
07809         UI.getUse().getResNo() != Addr.getResNo())
07810       continue;
07811 
07812     // Check that the add is independent of the load/store.  Otherwise, folding
07813     // it would create a cycle.
07814     if (User->isPredecessorOf(N) || N->isPredecessorOf(User))
07815       continue;
07816 
07817     // Find the new opcode for the updating load/store.
07818     bool IsStore = false;
07819     bool IsLaneOp = false;
07820     bool IsDupOp = false;
07821     unsigned NewOpc = 0;
07822     unsigned NumVecs = 0;
07823     unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
07824     switch (IntNo) {
07825     default: llvm_unreachable("unexpected intrinsic for Neon base update");
07826     case Intrinsic::aarch64_neon_ld2:       NewOpc = AArch64ISD::LD2post;
07827       NumVecs = 2; break;
07828     case Intrinsic::aarch64_neon_ld3:       NewOpc = AArch64ISD::LD3post;
07829       NumVecs = 3; break;
07830     case Intrinsic::aarch64_neon_ld4:       NewOpc = AArch64ISD::LD4post;
07831       NumVecs = 4; break;
07832     case Intrinsic::aarch64_neon_st2:       NewOpc = AArch64ISD::ST2post;
07833       NumVecs = 2; IsStore = true; break;
07834     case Intrinsic::aarch64_neon_st3:       NewOpc = AArch64ISD::ST3post;
07835       NumVecs = 3; IsStore = true; break;
07836     case Intrinsic::aarch64_neon_st4:       NewOpc = AArch64ISD::ST4post;
07837       NumVecs = 4; IsStore = true; break;
07838     case Intrinsic::aarch64_neon_ld1x2:     NewOpc = AArch64ISD::LD1x2post;
07839       NumVecs = 2; break;
07840     case Intrinsic::aarch64_neon_ld1x3:     NewOpc = AArch64ISD::LD1x3post;
07841       NumVecs = 3; break;
07842     case Intrinsic::aarch64_neon_ld1x4:     NewOpc = AArch64ISD::LD1x4post;
07843       NumVecs = 4; break;
07844     case Intrinsic::aarch64_neon_st1x2:     NewOpc = AArch64ISD::ST1x2post;
07845       NumVecs = 2; IsStore = true; break;
07846     case Intrinsic::aarch64_neon_st1x3:     NewOpc = AArch64ISD::ST1x3post;
07847       NumVecs = 3; IsStore = true; break;
07848     case Intrinsic::aarch64_neon_st1x4:     NewOpc = AArch64ISD::ST1x4post;
07849       NumVecs = 4; IsStore = true; break;
07850     case Intrinsic::aarch64_neon_ld2r:      NewOpc = AArch64ISD::LD2DUPpost;
07851       NumVecs = 2; IsDupOp = true; break;
07852     case Intrinsic::aarch64_neon_ld3r:      NewOpc = AArch64ISD::LD3DUPpost;
07853       NumVecs = 3; IsDupOp = true; break;
07854     case Intrinsic::aarch64_neon_ld4r:      NewOpc = AArch64ISD::LD4DUPpost;
07855       NumVecs = 4; IsDupOp = true; break;
07856     case Intrinsic::aarch64_neon_ld2lane:   NewOpc = AArch64ISD::LD2LANEpost;
07857       NumVecs = 2; IsLaneOp = true; break;
07858     case Intrinsic::aarch64_neon_ld3lane:   NewOpc = AArch64ISD::LD3LANEpost;
07859       NumVecs = 3; IsLaneOp = true; break;
07860     case Intrinsic::aarch64_neon_ld4lane:   NewOpc = AArch64ISD::LD4LANEpost;
07861       NumVecs = 4; IsLaneOp = true; break;
07862     case Intrinsic::aarch64_neon_st2lane:   NewOpc = AArch64ISD::ST2LANEpost;
07863       NumVecs = 2; IsStore = true; IsLaneOp = true; break;
07864     case Intrinsic::aarch64_neon_st3lane:   NewOpc = AArch64ISD::ST3LANEpost;
07865       NumVecs = 3; IsStore = true; IsLaneOp = true; break;
07866     case Intrinsic::aarch64_neon_st4lane:   NewOpc = AArch64ISD::ST4LANEpost;
07867       NumVecs = 4; IsStore = true; IsLaneOp = true; break;
07868     }
07869 
07870     EVT VecTy;
07871     if (IsStore)
07872       VecTy = N->getOperand(2).getValueType();
07873     else
07874       VecTy = N->getValueType(0);
07875 
07876     // If the increment is a constant, it must match the memory ref size.
07877     SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
07878     if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
07879       uint32_t IncVal = CInc->getZExtValue();
07880       unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
07881       if (IsLaneOp || IsDupOp)
07882         NumBytes /= VecTy.getVectorNumElements();
07883       if (IncVal != NumBytes)
07884         continue;
07885       Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
07886     }
07887     SmallVector<SDValue, 8> Ops;
07888     Ops.push_back(N->getOperand(0)); // Incoming chain
07889     // Load-lane and store operations have a vector list as input.
07890     if (IsLaneOp || IsStore)
07891       for (unsigned i = 2; i < AddrOpIdx; ++i)
07892         Ops.push_back(N->getOperand(i));
07893     Ops.push_back(Addr); // Base register
07894     Ops.push_back(Inc);
07895 
07896     // Return Types.
07897     EVT Tys[6];
07898     unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
07899     unsigned n;
07900     for (n = 0; n < NumResultVecs; ++n)
07901       Tys[n] = VecTy;
07902     Tys[n++] = MVT::i64;  // Type of write back register
07903     Tys[n] = MVT::Other;  // Type of the chain
07904     SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));
07905 
07906     MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
07907     SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
07908                                            MemInt->getMemoryVT(),
07909                                            MemInt->getMemOperand());
07910 
07911     // Update the uses.
07912     std::vector<SDValue> NewResults;
07913     for (unsigned i = 0; i < NumResultVecs; ++i) {
07914       NewResults.push_back(SDValue(UpdN.getNode(), i));
07915     }
07916     NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
07917     DCI.CombineTo(N, NewResults);
07918     DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
07919 
07920     break;
07921   }
07922   return SDValue();
07923 }
07924 
07925 // Checks to see if the value is the prescribed width and returns information
07926 // about its extension mode.
07927 static
07928 bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
07929   ExtType = ISD::NON_EXTLOAD;
07930   switch(V.getNode()->getOpcode()) {
07931   default:
07932     return false;
07933   case ISD::LOAD: {
07934     LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
07935     if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
07936        || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
07937       ExtType = LoadNode->getExtensionType();
07938       return true;
07939     }
07940     return false;
07941   }
07942   case ISD::AssertSext: {
07943     VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
07944     if ((TypeNode->getVT() == MVT::i8 && width == 8)
07945        || (TypeNode->getVT() == MVT::i16 && width == 16)) {
07946       ExtType = ISD::SEXTLOAD;
07947       return true;
07948     }
07949     return false;
07950   }
07951   case ISD::AssertZext: {
07952     VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
07953     if ((TypeNode->getVT() == MVT::i8 && width == 8)
07954        || (TypeNode->getVT() == MVT::i16 && width == 16)) {
07955       ExtType = ISD::ZEXTLOAD;
07956       return true;
07957     }
07958     return false;
07959   }
07960   case ISD::Constant:
07961   case ISD::TargetConstant: {
07962     if (std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
07963         1LL << (width - 1))
07964       return true;
07965     return false;
07966   }
07967   }
07968 
07969   return true;
07970 }
07971 
07972 // This function does a whole lot of voodoo to determine if the tests are
07973 // equivalent without and with a mask. Essentially what happens is that given a
07974 // DAG resembling:
07975 //
07976 //  +-------------+ +-------------+ +-------------+ +-------------+
07977 //  |    Input    | | AddConstant | | CompConstant| |     CC      |
07978 //  +-------------+ +-------------+ +-------------+ +-------------+
07979 //           |           |           |               |
07980 //           V           V           |    +----------+
07981 //          +-------------+  +----+  |    |
07982 //          |     ADD     |  |0xff|  |    |
07983 //          +-------------+  +----+  |    |
07984 //                  |           |    |    |
07985 //                  V           V    |    |
07986 //                 +-------------+   |    |
07987 //                 |     AND     |   |    |
07988 //                 +-------------+   |    |
07989 //                      |            |    |
07990 //                      +-----+      |    |
07991 //                            |      |    |
07992 //                            V      V    V
07993 //                           +-------------+
07994 //                           |     CMP     |
07995 //                           +-------------+
07996 //
07997 // The AND node may be safely removed for some combinations of inputs. In
07998 // particular we need to take into account the extension type of the Input,
07999 // the exact values of AddConstant, CompConstant, and CC, along with the nominal
08000 // width of the input (this can work for any input width; the above graph is
08001 // specific to 8 bits).
08002 //
08003 // The specific equations were worked out by generating output tables for each
08004 // AArch64CC value in terms of AddConstant (w1) and CompConstant (w2). The
08005 // problem was simplified by working with 4 bit inputs, which means we only
08006 // needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
08007 // extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8
08008 // patterns present in both extensions (0,7). For every distinct set of
08009 // AddConstant and CompConstant bit patterns we can consider the masked and
08010 // unmasked versions to be equivalent if the result of this function is true for
08011 // all 16 distinct bit patterns for the current extension type of Input (w0).
08012 //
08013 //   sub      w8, w0, w1
08014 //   and      w10, w8, #0x0f
08015 //   cmp      w8, w2
08016 //   cset     w9, AArch64CC
08017 //   cmp      w10, w2
08018 //   cset     w11, AArch64CC
08019 //   cmp      w9, w11
08020 //   cset     w0, eq
08021 //   ret
08022 //
08023 // Since the above function shows when the outputs are equivalent it defines
08024 // when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
08025 // would be expensive to run during compiles. The equations below were written
08026 // in a test harness that confirmed they gave outputs equivalent to the above
08027 // function for all inputs, so they can be used to determine if the removal is
08028 // legal instead.
08029 //
08030 // isEquivalentMaskless() is the code for testing if the AND can be removed,
08031 // factored out of the DAG recognition since the DAG can take several forms.
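      //
      // As a small worked example (a sketch, not exhaustive): for an 8-bit
      // zero-extended Input with AddConstant == 0, the sum Input + AddConstant
      // already fits in 8 bits, so ANDing it with 0xff changes nothing and the
      // masked and unmasked compares set identical flags for every CC.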
08032 
08033 static
08034 bool isEquivalentMaskless(unsigned CC, unsigned width,
08035                           ISD::LoadExtType ExtType, signed AddConstant,
08036                           signed CompConstant) {
08037   // By being careful about our equations and only writing them in terms of
08038   // symbolic values and well known constants (0, 1, -1, MaxUInt) we can
08039   // make them generally applicable to all bit widths.
08040   signed MaxUInt = (1 << width);
08041 
08042   // For the purposes of these comparisons sign extending the type is
08043   // equivalent to zero extending the add and displacing it by half the integer
08044   // width. Provided we are careful and make sure our equations are valid over
08045   // the whole range we can just adjust the input and avoid writing equations
08046   // for sign extended inputs.
08047   if (ExtType == ISD::SEXTLOAD)
08048     AddConstant -= (1 << (width-1));
08049 
08050   switch(CC) {
08051   case AArch64CC::LE:
08052   case AArch64CC::GT: {
08053     if ((AddConstant == 0) ||
08054         (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
08055         (AddConstant >= 0 && CompConstant < 0) ||
08056         (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
08057       return true;
08058   } break;
08059   case AArch64CC::LT:
08060   case AArch64CC::GE: {
08061     if ((AddConstant == 0) ||
08062         (AddConstant >= 0 && CompConstant <= 0) ||
08063         (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
08064       return true;
08065   } break;
08066   case AArch64CC::HI:
08067   case AArch64CC::LS: {
08068     if ((AddConstant >= 0 && CompConstant < 0) ||
08069        (AddConstant <= 0 && CompConstant >= -1 &&
08070         CompConstant < AddConstant + MaxUInt))
08071       return true;
08072   } break;
08073   case AArch64CC::PL:
08074   case AArch64CC::MI: {
08075     if ((AddConstant == 0) ||
08076         (AddConstant > 0 && CompConstant <= 0) ||
08077         (AddConstant < 0 && CompConstant <= AddConstant))
08078       return true;
08079   } break;
08080   case AArch64CC::LO:
08081   case AArch64CC::HS: {
08082     if ((AddConstant >= 0 && CompConstant <= 0) ||
08083         (AddConstant <= 0 && CompConstant >= 0 &&
08084          CompConstant <= AddConstant + MaxUInt))
08085       return true;
08086   } break;
08087   case AArch64CC::EQ:
08088   case AArch64CC::NE: {
08089     if ((AddConstant > 0 && CompConstant < 0) ||
08090         (AddConstant < 0 && CompConstant >= 0 &&
08091          CompConstant < AddConstant + MaxUInt) ||
08092         (AddConstant >= 0 && CompConstant >= 0 &&
08093          CompConstant >= AddConstant) ||
08094         (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
08095 
08096       return true;
08097   } break;
08098   case AArch64CC::VS:
08099   case AArch64CC::VC:
08100   case AArch64CC::AL:
08101   case AArch64CC::NV:
08102     return true;
08103   case AArch64CC::Invalid:
08104     break;
08105   }
08106 
08107   return false;
08108 }
08109 
08110 static
08111 SDValue performCONDCombine(SDNode *N,
08112                            TargetLowering::DAGCombinerInfo &DCI,
08113                            SelectionDAG &DAG, unsigned CCIndex,
08114                            unsigned CmpIndex) {
08115   unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
08116   SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
08117   unsigned CondOpcode = SubsNode->getOpcode();
08118 
08119   if (CondOpcode != AArch64ISD::SUBS)
08120     return SDValue();
08121 
08122   // There is a SUBS feeding this condition. Is it fed by a mask we can
08123   // use?
08124 
08125   SDNode *AndNode = SubsNode->getOperand(0).getNode();
08126   unsigned MaskBits = 0;
08127 
08128   if (AndNode->getOpcode() != ISD::AND)
08129     return SDValue();
08130 
08131   if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
08132     uint32_t CNV = CN->getZExtValue();
08133     if (CNV == 255)
08134       MaskBits = 8;
08135     else if (CNV == 65535)
08136       MaskBits = 16;
08137   }
08138 
08139   if (!MaskBits)
08140     return SDValue();
08141 
08142   SDValue AddValue = AndNode->getOperand(0);
08143 
08144   if (AddValue.getOpcode() != ISD::ADD)
08145     return SDValue();
08146 
08147   // The basic dag structure is correct, grab the inputs and validate them.
08148 
08149   SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
08150   SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
08151   SDValue SubsInputValue = SubsNode->getOperand(1);
08152 
08153   // The mask is present and the provenance of all the values is a smaller type,
08154   // let's see if the mask is superfluous.
08155 
08156   if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
08157       !isa<ConstantSDNode>(SubsInputValue.getNode()))
08158     return SDValue();
08159 
08160   ISD::LoadExtType ExtType;
08161 
08162   if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
08163       !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
08164       !checkValueWidth(AddInputValue1, MaskBits, ExtType) )
08165     return SDValue();
08166 
08167   if(!isEquivalentMaskless(CC, MaskBits, ExtType,
08168                 cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
08169                 cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
08170     return SDValue();
08171 
08172   // The AND is not necessary, remove it.
08173 
08174   SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
08175                                SubsNode->getValueType(1));
08176   SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
08177 
08178   SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
08179   DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
08180 
08181   return SDValue(N, 0);
08182 }
08183 
08184 // Optimize compare with zero and branch.
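      // For example (a sketch): a conditional branch on the EQ flags of
      // "subs wzr, w0, #0" becomes "cbz w0, dest"; the NE form becomes
      // "cbnz w0, dest".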
08185 static SDValue performBRCONDCombine(SDNode *N,
08186                                     TargetLowering::DAGCombinerInfo &DCI,
08187                                     SelectionDAG &DAG) {
08188   SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3);
08189   if (NV.getNode())
08190     N = NV.getNode();
08191   SDValue Chain = N->getOperand(0);
08192   SDValue Dest = N->getOperand(1);
08193   SDValue CCVal = N->getOperand(2);
08194   SDValue Cmp = N->getOperand(3);
08195 
08196   assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
08197   unsigned CC = cast<ConstantSDNode>(CCVal)->getZExtValue();
08198   if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
08199     return SDValue();
08200 
08201   unsigned CmpOpc = Cmp.getOpcode();
08202   if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
08203     return SDValue();
08204 
08205   // Only attempt folding if there is only one use of the flag and no use of the
08206   // value.
08207   if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
08208     return SDValue();
08209 
08210   SDValue LHS = Cmp.getOperand(0);
08211   SDValue RHS = Cmp.getOperand(1);
08212 
08213   assert(LHS.getValueType() == RHS.getValueType() &&
08214          "Expected the value type to be the same for both operands!");
08215   if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
08216     return SDValue();
08217 
08218   if (isa<ConstantSDNode>(LHS) && cast<ConstantSDNode>(LHS)->isNullValue())
08219     std::swap(LHS, RHS);
08220 
08221   if (!isa<ConstantSDNode>(RHS) || !cast<ConstantSDNode>(RHS)->isNullValue())
08222     return SDValue();
08223 
08224   if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
08225       LHS.getOpcode() == ISD::SRL)
08226     return SDValue();
08227 
08228   // Fold the compare into the branch instruction.
08229   SDValue BR;
08230   if (CC == AArch64CC::EQ)
08231     BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
08232   else
08233     BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
08234 
08235   // Do not add new nodes to DAG combiner worklist.
08236   DCI.CombineTo(N, BR, false);
08237 
08238   return SDValue();
08239 }
08240 
08241 // vselect (v1i1 setcc) ->
08242 //     vselect (v1iXX setcc)  (XX is the size of the compared operand type)
08243 // FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
08244 // condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
08245 // such VSELECT.
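      // For example (a sketch): "vselect (v1i1 (setcc v1i64 %a, %b)), %t, %f" is
      // rewritten as "vselect (v1i64 (setcc %a, %b)), %t, %f", so the condition
      // vector has the same width as the compared operands.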
08246 static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
08247   SDValue N0 = N->getOperand(0);
08248   EVT CCVT = N0.getValueType();
08249 
08250   if (N0.getOpcode() != ISD::SETCC || CCVT.getVectorNumElements() != 1 ||
08251       CCVT.getVectorElementType() != MVT::i1)
08252     return SDValue();
08253 
08254   EVT ResVT = N->getValueType(0);
08255   EVT CmpVT = N0.getOperand(0).getValueType();
08256   // Only combine when the result type is of the same size as the compared
08257   // operands.
08258   if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
08259     return SDValue();
08260 
08261   SDValue IfTrue = N->getOperand(1);
08262   SDValue IfFalse = N->getOperand(2);
08263   SDValue SetCC =
08264       DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
08265                    N0.getOperand(0), N0.getOperand(1),
08266                    cast<CondCodeSDNode>(N0.getOperand(2))->get());
08267   return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
08268                      IfTrue, IfFalse);
08269 }
08270 
08271 /// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
08272 /// the compare-mask instructions rather than going via NZCV, even if LHS and
08273 /// RHS are really scalar. This replaces any scalar setcc in the above pattern
08274 /// with a vector one followed by a DUP shuffle on the result.
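      /// For example (a sketch): "select (setcc f64 %a, %b), v2f64 %t, v2f64 %f"
      /// becomes a v2f64 vector compare of %a and %b performed in lane 0, whose
      /// result is broadcast by a DUP-style shuffle and used as the VSELECT mask.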
08275 static SDValue performSelectCombine(SDNode *N, SelectionDAG &DAG) {
08276   SDValue N0 = N->getOperand(0);
08277   EVT ResVT = N->getValueType(0);
08278 
08279   if (N0.getOpcode() != ISD::SETCC || N0.getValueType() != MVT::i1)
08280     return SDValue();
08281 
08282   // If NumMaskElts == 0, the comparison is larger than the select result. The
08283   // largest real NEON comparison is 64-bits per lane, which means the result is
08284   // at most 32-bits and an illegal vector. Just bail out for now.
08285   EVT SrcVT = N0.getOperand(0).getValueType();
08286   int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
08287   if (!ResVT.isVector() || NumMaskElts == 0)
08288     return SDValue();
08289 
08290   SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
08291   EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
08292 
08293   // First perform a vector comparison, where lane 0 is the one we're interested
08294   // in.
08295   SDLoc DL(N0);
08296   SDValue LHS =
08297       DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
08298   SDValue RHS =
08299       DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
08300   SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
08301 
08302   // Now duplicate the comparison mask we want across all other lanes.
08303   SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
08304   SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask.data());
08305   Mask = DAG.getNode(ISD::BITCAST, DL,
08306                      ResVT.changeVectorElementTypeToInteger(), Mask);
08307 
08308   return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
08309 }
08310 
08311 SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
08312                                                  DAGCombinerInfo &DCI) const {
08313   SelectionDAG &DAG = DCI.DAG;
08314   switch (N->getOpcode()) {
08315   default:
08316     break;
08317   case ISD::ADD:
08318   case ISD::SUB:
08319     return performAddSubLongCombine(N, DCI, DAG);
08320   case ISD::XOR:
08321     return performXorCombine(N, DAG, DCI, Subtarget);
08322   case ISD::MUL:
08323     return performMulCombine(N, DAG, DCI, Subtarget);
08324   case ISD::SINT_TO_FP:
08325   case ISD::UINT_TO_FP:
08326     return performIntToFpCombine(N, DAG);
08327   case ISD::OR:
08328     return performORCombine(N, DCI, Subtarget);
08329   case ISD::INTRINSIC_WO_CHAIN:
08330     return performIntrinsicCombine(N, DCI, Subtarget);
08331   case ISD::ANY_EXTEND:
08332   case ISD::ZERO_EXTEND:
08333   case ISD::SIGN_EXTEND:
08334     return performExtendCombine(N, DCI, DAG);
08335   case ISD::BITCAST:
08336     return performBitcastCombine(N, DCI, DAG);
08337   case ISD::CONCAT_VECTORS:
08338     return performConcatVectorsCombine(N, DCI, DAG);
08339   case ISD::SELECT:
08340     return performSelectCombine(N, DAG);
08341   case ISD::VSELECT:
08342     return performVSelectCombine(N, DCI.DAG);
08343   case ISD::STORE:
08344     return performSTORECombine(N, DCI, DAG, Subtarget);
08345   case AArch64ISD::BRCOND:
08346     return performBRCONDCombine(N, DCI, DAG);
08347   case AArch64ISD::CSEL:
08348     return performCONDCombine(N, DCI, DAG, 2, 3);
08349   case AArch64ISD::DUP:
08350     return performPostLD1Combine(N, DCI, false);
08351   case ISD::INSERT_VECTOR_ELT:
08352     return performPostLD1Combine(N, DCI, true);
08353   case ISD::INTRINSIC_VOID:
08354   case ISD::INTRINSIC_W_CHAIN:
08355     switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
08356     case Intrinsic::aarch64_neon_ld2:
08357     case Intrinsic::aarch64_neon_ld3:
08358     case Intrinsic::aarch64_neon_ld4:
08359     case Intrinsic::aarch64_neon_ld1x2:
08360     case Intrinsic::aarch64_neon_ld1x3:
08361     case Intrinsic::aarch64_neon_ld1x4:
08362     case Intrinsic::aarch64_neon_ld2lane:
08363     case Intrinsic::aarch64_neon_ld3lane:
08364     case Intrinsic::aarch64_neon_ld4lane:
08365     case Intrinsic::aarch64_neon_ld2r:
08366     case Intrinsic::aarch64_neon_ld3r:
08367     case Intrinsic::aarch64_neon_ld4r:
08368     case Intrinsic::aarch64_neon_st2:
08369     case Intrinsic::aarch64_neon_st3:
08370     case Intrinsic::aarch64_neon_st4:
08371     case Intrinsic::aarch64_neon_st1x2:
08372     case Intrinsic::aarch64_neon_st1x3:
08373     case Intrinsic::aarch64_neon_st1x4:
08374     case Intrinsic::aarch64_neon_st2lane:
08375     case Intrinsic::aarch64_neon_st3lane:
08376     case Intrinsic::aarch64_neon_st4lane:
08377       return performNEONPostLDSTCombine(N, DCI, DAG);
08378     default:
08379       break;
08380     }
08381   }
08382   return SDValue();
08383 }
08384 
08385 // Check if the return value is used only as a return value, as otherwise
08386 // we can't perform a tail-call. In particular, we need to check for
08387 // target ISD nodes that are returns and any other "odd" constructs
08388 // that the generic analysis code won't necessarily catch.
08389 bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
08390                                                SDValue &Chain) const {
08391   if (N->getNumValues() != 1)
08392     return false;
08393   if (!N->hasNUsesOfValue(1, 0))
08394     return false;
08395 
08396   SDValue TCChain = Chain;
08397   SDNode *Copy = *N->use_begin();
08398   if (Copy->getOpcode() == ISD::CopyToReg) {
08399     // If the copy has a glue operand, we conservatively assume it isn't safe to
08400     // perform a tail call.
08401     if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
08402         MVT::Glue)
08403       return false;
08404     TCChain = Copy->getOperand(0);
08405   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
08406     return false;
08407 
08408   bool HasRet = false;
08409   for (SDNode *Node : Copy->uses()) {
08410     if (Node->getOpcode() != AArch64ISD::RET_FLAG)
08411       return false;
08412     HasRet = true;
08413   }
08414 
08415   if (!HasRet)
08416     return false;
08417 
08418   Chain = TCChain;
08419   return true;
08420 }
08421 
08422 // Return whether an instruction can potentially be optimized to a tail
08423 // call. This will cause the optimizers to attempt to move or duplicate
08424 // return instructions to help enable tail call optimizations for this
08425 // instruction.
08426 bool AArch64TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
08427   if (!CI->isTailCall())
08428     return false;
08429 
08430   return true;
08431 }
08432 
08433 bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base,
08434                                                    SDValue &Offset,
08435                                                    ISD::MemIndexedMode &AM,
08436                                                    bool &IsInc,
08437                                                    SelectionDAG &DAG) const {
08438   if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
08439     return false;
08440 
08441   Base = Op->getOperand(0);
08442   // All of the indexed addressing mode instructions take a signed
08443   // 9 bit immediate offset.
08444   if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
08445     int64_t RHSC = (int64_t)RHS->getZExtValue();
08446     if (RHSC >= 256 || RHSC <= -256)
08447       return false;
08448     IsInc = (Op->getOpcode() == ISD::ADD);
08449     Offset = Op->getOperand(1);
08450     return true;
08451   }
08452   return false;
08453 }
08454 
08455 bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
08456                                                       SDValue &Offset,
08457                                                       ISD::MemIndexedMode &AM,
08458                                                       SelectionDAG &DAG) const {
08459   EVT VT;
08460   SDValue Ptr;
08461   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
08462     VT = LD->getMemoryVT();
08463     Ptr = LD->getBasePtr();
08464   } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
08465     VT = ST->getMemoryVT();
08466     Ptr = ST->getBasePtr();
08467   } else
08468     return false;
08469 
08470   bool IsInc;
08471   if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG))
08472     return false;
08473   AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC;
08474   return true;
08475 }
08476 
08477 bool AArch64TargetLowering::getPostIndexedAddressParts(
08478     SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
08479     ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
08480   EVT VT;
08481   SDValue Ptr;
08482   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
08483     VT = LD->getMemoryVT();
08484     Ptr = LD->getBasePtr();
08485   } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
08486     VT = ST->getMemoryVT();
08487     Ptr = ST->getBasePtr();
08488   } else
08489     return false;
08490 
08491   bool IsInc;
08492   if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG))
08493     return false;
08494   // Post-indexing updates the base, so it's not a valid transform
08495   // if that's not the same as the load's pointer.
08496   if (Ptr != Base)
08497     return false;
08498   AM = IsInc ? ISD::POST_INC : ISD::POST_DEC;
08499   return true;
08500 }
08501 
08502 static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
08503                                   SelectionDAG &DAG) {
08504   if (N->getValueType(0) != MVT::i16)
08505     return;
08506 
08507   SDLoc DL(N);
08508   SDValue Op = N->getOperand(0);
08509   assert(Op.getValueType() == MVT::f16 &&
08510          "Inconsistent bitcast? Only 16-bit types should be i16 or f16");
08511   Op = SDValue(
08512       DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32,
08513                          DAG.getUNDEF(MVT::i32), Op,
08514                          DAG.getTargetConstant(AArch64::hsub, MVT::i32)),
08515       0);
08516   Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
08517   Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
08518 }
08519 
08520 void AArch64TargetLowering::ReplaceNodeResults(
08521     SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
08522   switch (N->getOpcode()) {
08523   default:
08524     llvm_unreachable("Don't know how to custom expand this");
08525   case ISD::BITCAST:
08526     ReplaceBITCASTResults(N, Results, DAG);
08527     return;
08528   case ISD::FP_TO_UINT:
08529   case ISD::FP_TO_SINT:
08530     assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
08531     // Let normal code take care of it by not adding anything to Results.
08532     return;
08533   }
08534 }
08535 
08536 bool AArch64TargetLowering::useLoadStackGuardNode() const {
08537   return true;
08538 }
08539 
08540 TargetLoweringBase::LegalizeTypeAction
08541 AArch64TargetLowering::getPreferredVectorAction(EVT VT) const {
08542   MVT SVT = VT.getSimpleVT();
08543   // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8,
08544   // v4i16, v2i32 instead of promoting them.
08545   if (SVT == MVT::v1i8 || SVT == MVT::v1i16 || SVT == MVT::v1i32
08546       || SVT == MVT::v1f32)
08547     return TypeWidenVector;
08548 
08549   return TargetLoweringBase::getPreferredVectorAction(VT);
08550 }
08551 
08552 // Loads and stores less than 128-bits are already atomic; ones above that
08553 // are doomed anyway, so defer to the default libcall and blame the OS when
08554 // things go wrong.
08555 bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
08556   unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
08557   return Size == 128;
08558 }
08559 
08560 // Loads and stores less than 128-bits are already atomic; ones above that
08561 // are doomed anyway, so defer to the default libcall and blame the OS when
08562 // things go wrong.
08563 bool AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
08564   unsigned Size = LI->getType()->getPrimitiveSizeInBits();
08565   return Size == 128;
08566 }
08567 
08568 // For the real atomic operations, we have ldxr/stxr up to 128 bits.
08569 bool AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
08570   unsigned Size = AI->getType()->getPrimitiveSizeInBits();
08571   return Size <= 128;
08572 }
08573 
08574 bool AArch64TargetLowering::hasLoadLinkedStoreConditional() const {
08575   return true;
08576 }
08577 
08578 Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
08579                                              AtomicOrdering Ord) const {
08580   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
08581   Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
08582   bool IsAcquire = isAtLeastAcquire(Ord);
08583 
08584   // Since i128 isn't legal and intrinsics don't get type-lowered, the ldxp
08585   // intrinsic must return {i64, i64} and we have to recombine them into a
08586   // single i128 here.
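        // A sketch of the IR emitted for the 128-bit case:
        //   %lohi = call { i64, i64 } @llvm.aarch64.ldaxp(i8* %addr) ; or ldxp
        //   %val  = or (zext %lo to i128), (shl (zext %hi to i128), 64)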
08587   if (ValTy->getPrimitiveSizeInBits() == 128) {
08588     Intrinsic::ID Int =
08589         IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
08590     Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int);
08591 
08592     Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
08593     Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");
08594 
08595     Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
08596     Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
08597     Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
08598     Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
08599     return Builder.CreateOr(
08600         Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64");
08601   }
08602 
08603   Type *Tys[] = { Addr->getType() };
08604   Intrinsic::ID Int =
08605       IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
08606   Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int, Tys);
08607 
08608   return Builder.CreateTruncOrBitCast(
08609       Builder.CreateCall(Ldxr, Addr),
08610       cast<PointerType>(Addr->getType())->getElementType());
08611 }
08612 
08613 Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
08614                                                    Value *Val, Value *Addr,
08615                                                    AtomicOrdering Ord) const {
08616   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
08617   bool IsRelease = isAtLeastRelease(Ord);
08618 
08619   // Since the intrinsics must have legal type, the i128 intrinsics take two
08620   // parameters: "i64, i64". We must marshal Val into the appropriate form
08621   // before the call.
08622   if (Val->getType()->getPrimitiveSizeInBits() == 128) {
08623     Intrinsic::ID Int =
08624         IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
08625     Function *Stxr = Intrinsic::getDeclaration(M, Int);
08626     Type *Int64Ty = Type::getInt64Ty(M->getContext());
08627 
08628     Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
08629     Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
08630     Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
08631     return Builder.CreateCall3(Stxr, Lo, Hi, Addr);
08632   }
08633 
08634   Intrinsic::ID Int =
08635       IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
08636   Type *Tys[] = { Addr->getType() };
08637   Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);
08638 
08639   return Builder.CreateCall2(
08640       Stxr, Builder.CreateZExtOrBitCast(
08641                 Val, Stxr->getFunctionType()->getParamType(0)),
08642       Addr);
08643 }