AArch64LoadStoreOptimizer.cpp

//=- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -*- C++ -*-=//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that performs load / store related peephole
// optimizations. This pass should be run after register allocation.
//
//===----------------------------------------------------------------------===//
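
// For illustration (added comment, not part of the original header), the two
// rewrites this pass performs look like:
//
//   ldr x0, [x2]
//   ldr x1, [x2, #8]
//     ; becomes
//   ldp x0, x1, [x2]
//
// and, for base register updates:
//
//   ldr x0, [x2]
//   add x2, x2, #4
//     ; becomes
//   ldr x0, [x2], #4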

#include "AArch64InstrInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
using namespace llvm;

#define DEBUG_TYPE "aarch64-ldst-opt"

/// AArch64LoadStoreOpt - Post-register allocation pass to combine
/// load / store instructions to form ldp / stp instructions.

STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
STATISTIC(NumPostFolded, "Number of post-index updates folded");
STATISTIC(NumPreFolded, "Number of pre-index updates folded");
STATISTIC(NumUnscaledPairCreated,
          "Number of load/store pairs generated from unscaled instructions");

static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit",
                                   cl::init(20), cl::Hidden);

// Placeholder while testing unscaled load/store combining.
static cl::opt<bool> EnableAArch64UnscaledMemOp(
    "aarch64-unscaled-mem-op", cl::Hidden,
    cl::desc("Allow AArch64 unscaled load/store combining"), cl::init(true));

namespace {
struct AArch64LoadStoreOpt : public MachineFunctionPass {
  static char ID;
  AArch64LoadStoreOpt() : MachineFunctionPass(ID) {}

  const AArch64InstrInfo *TII;
  const TargetRegisterInfo *TRI;

  // Scan the instructions looking for a load/store that can be combined
  // with the current instruction into a load/store pair.
  // Return the matching instruction if one is found, else MBB->end().
  // If a matching instruction is found, MergeForward is set to true if the
  // merge removes the first instruction and folds it into the second
  // (pair-wise) insn, and to false if the second is folded into the first.
  MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
                                               bool &MergeForward,
                                               unsigned Limit);
  // Merge the two instructions indicated into a single pair-wise instruction.
  // If MergeForward is true, erase the first instruction and fold its
  // operation into the second. If false, the reverse. Return the instruction
  // following the first instruction (which may change during processing).
  MachineBasicBlock::iterator
  mergePairedInsns(MachineBasicBlock::iterator I,
                   MachineBasicBlock::iterator Paired, bool MergeForward);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan forwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, unsigned Limit,
                                int Value);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan backwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);

  // Merge a pre-index base register update into a ld/st instruction.
  MachineBasicBlock::iterator
  mergePreIdxUpdateInsn(MachineBasicBlock::iterator I,
                        MachineBasicBlock::iterator Update);

  // Merge a post-index base register update into a ld/st instruction.
  MachineBasicBlock::iterator
  mergePostIdxUpdateInsn(MachineBasicBlock::iterator I,
                         MachineBasicBlock::iterator Update);

  bool optimizeBlock(MachineBasicBlock &MBB);

  bool runOnMachineFunction(MachineFunction &Fn) override;

  const char *getPassName() const override {
    return "AArch64 load / store optimization pass";
  }

private:
  int getMemSize(MachineInstr *MemMI);
};
char AArch64LoadStoreOpt::ID = 0;
} // namespace

static bool isUnscaledLdst(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64::STURSi:
  case AArch64::STURDi:
  case AArch64::STURQi:
  case AArch64::STURWi:
  case AArch64::STURXi:
  case AArch64::LDURSi:
  case AArch64::LDURDi:
  case AArch64::LDURQi:
  case AArch64::LDURWi:
  case AArch64::LDURXi:
    return true;
  }
}

// Size in bytes of the data moved by a (scaled or unscaled) load or store.
int AArch64LoadStoreOpt::getMemSize(MachineInstr *MemMI) {
  switch (MemMI->getOpcode()) {
  default:
    llvm_unreachable("Opcode has unknown size!");
  case AArch64::STRSui:
  case AArch64::STURSi:
    return 4;
  case AArch64::STRDui:
  case AArch64::STURDi:
    return 8;
  case AArch64::STRQui:
  case AArch64::STURQi:
    return 16;
  case AArch64::STRWui:
  case AArch64::STURWi:
    return 4;
  case AArch64::STRXui:
  case AArch64::STURXi:
    return 8;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return 4;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
    return 8;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
    return 16;
  case AArch64::LDRWui:
  case AArch64::LDURWi:
    return 4;
  case AArch64::LDRXui:
  case AArch64::LDURXi:
    return 8;
  }
}

static unsigned getMatchingPairOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pairwise equivalent!");
  case AArch64::STRSui:
  case AArch64::STURSi:
    return AArch64::STPSi;
  case AArch64::STRDui:
  case AArch64::STURDi:
    return AArch64::STPDi;
  case AArch64::STRQui:
  case AArch64::STURQi:
    return AArch64::STPQi;
  case AArch64::STRWui:
  case AArch64::STURWi:
    return AArch64::STPWi;
  case AArch64::STRXui:
  case AArch64::STURXi:
    return AArch64::STPXi;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return AArch64::LDPSi;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
    return AArch64::LDPDi;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
    return AArch64::LDPQi;
  case AArch64::LDRWui:
  case AArch64::LDURWi:
    return AArch64::LDPWi;
  case AArch64::LDRXui:
  case AArch64::LDURXi:
    return AArch64::LDPXi;
  }
}

static unsigned getPreIndexedOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pre-indexed equivalent!");
  case AArch64::STRSui:
    return AArch64::STRSpre;
  case AArch64::STRDui:
    return AArch64::STRDpre;
  case AArch64::STRQui:
    return AArch64::STRQpre;
  case AArch64::STRWui:
    return AArch64::STRWpre;
  case AArch64::STRXui:
    return AArch64::STRXpre;
  case AArch64::LDRSui:
    return AArch64::LDRSpre;
  case AArch64::LDRDui:
    return AArch64::LDRDpre;
  case AArch64::LDRQui:
    return AArch64::LDRQpre;
  case AArch64::LDRWui:
    return AArch64::LDRWpre;
  case AArch64::LDRXui:
    return AArch64::LDRXpre;
  }
}

static unsigned getPostIndexedOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no post-indexed equivalent!");
  case AArch64::STRSui:
    return AArch64::STRSpost;
  case AArch64::STRDui:
    return AArch64::STRDpost;
  case AArch64::STRQui:
    return AArch64::STRQpost;
  case AArch64::STRWui:
    return AArch64::STRWpost;
  case AArch64::STRXui:
    return AArch64::STRXpost;
  case AArch64::LDRSui:
    return AArch64::LDRSpost;
  case AArch64::LDRDui:
    return AArch64::LDRDpost;
  case AArch64::LDRQui:
    return AArch64::LDRQpost;
  case AArch64::LDRWui:
    return AArch64::LDRWpost;
  case AArch64::LDRXui:
    return AArch64::LDRXpost;
  }
}
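
// Quick reference for the addressing forms involved (added comment, not in
// the original source):
//   pre-indexed:  ldr x0, [x1, #16]!   ; x1 is updated to x1 + 16, then the
//                                      ; load uses the new x1
//   post-indexed: ldr x0, [x1], #16    ; the load uses the old x1, then
//                                      ; x1 is updated to x1 + 16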

MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
                                      MachineBasicBlock::iterator Paired,
                                      bool MergeForward) {
  MachineBasicBlock::iterator NextI = I;
  ++NextI;
  // If NextI is the second of the two instructions to be merged, we need
  // to skip one further. Either way, the merge will invalidate the iterator,
  // and we don't need to scan the new instruction, as it's a pairwise
  // instruction, which we're not considering for further action anyway.
  if (NextI == Paired)
    ++NextI;

  bool IsUnscaled = isUnscaledLdst(I->getOpcode());
  int OffsetStride =
      IsUnscaled && EnableAArch64UnscaledMemOp ? getMemSize(I) : 1;

  unsigned NewOpc = getMatchingPairOpcode(I->getOpcode());
  // Insert our new paired instruction after whichever of the paired
  // instructions MergeForward indicates.
  MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
  // MergeForward also determines which instruction we copy the base register
  // operand from, so that the operand flags stay compatible with the input
  // code.
  MachineOperand &BaseRegOp =
      MergeForward ? Paired->getOperand(1) : I->getOperand(1);

  // Which register is Rt and which is Rt2 depends on the offset order.
  MachineInstr *RtMI, *Rt2MI;
  if (I->getOperand(2).getImm() ==
      Paired->getOperand(2).getImm() + OffsetStride) {
    RtMI = Paired;
    Rt2MI = I;
  } else {
    RtMI = I;
    Rt2MI = Paired;
  }
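  // Illustrative note (added comment, not in the original source): if I has
  // immediate offset 1 and Paired has offset 0 (OffsetStride 1 for scaled
  // instructions), the test above selects Paired as RtMI, so the
  // lower-addressed register becomes Rt and the result comes out as e.g.
  // "ldp x0, x1, [x2]" with operands in increasing address order.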
  // For unscaled instructions, convert the byte offset into the element
  // offset expected by the scaled pair opcode.
  int OffsetImm = RtMI->getOperand(2).getImm();
  if (IsUnscaled && EnableAArch64UnscaledMemOp)
    OffsetImm /= OffsetStride;

  // Construct the new instruction.
  MachineInstrBuilder MIB = BuildMI(*I->getParent(), InsertionPoint,
                                    I->getDebugLoc(), TII->get(NewOpc))
                                .addOperand(RtMI->getOperand(0))
                                .addOperand(Rt2MI->getOperand(0))
                                .addOperand(BaseRegOp)
                                .addImm(OffsetImm);
  (void)MIB;

  // FIXME: Do we need/want to copy the mem operands from the source
  //        instructions? Probably. What uses them after this?

  DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n    ");
  DEBUG(I->print(dbgs()));
  DEBUG(dbgs() << "    ");
  DEBUG(Paired->print(dbgs()));
  DEBUG(dbgs() << "  with instruction:\n    ");
  DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  DEBUG(dbgs() << "\n");

  // Erase the old instructions.
  I->eraseFromParent();
  Paired->eraseFromParent();

  return NextI;
}

/// trackRegDefsUses - Remember what registers the specified instruction uses
/// and modifies.
static void trackRegDefsUses(MachineInstr *MI, BitVector &ModifiedRegs,
                             BitVector &UsedRegs,
                             const TargetRegisterInfo *TRI) {
  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
    MachineOperand &MO = MI->getOperand(i);
    if (MO.isRegMask())
      ModifiedRegs.setBitsNotInMask(MO.getRegMask());

    if (!MO.isReg())
      continue;
    unsigned Reg = MO.getReg();
    if (MO.isDef()) {
      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
        ModifiedRegs.set(*AI);
    } else {
      assert(MO.isUse() && "Reg operand not a def and not a use?!?");
      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
        UsedRegs.set(*AI);
    }
  }
}
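
// Note (added for illustration): MCRegAliasIterator walks every alias of a
// register, so a def of w0 also marks x0 as modified. Later checks against
// ModifiedRegs/UsedRegs therefore catch overlapping registers as well.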

static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
  if (!IsUnscaled && (Offset > 63 || Offset < -64))
    return false;
  if (IsUnscaled) {
    // Convert the byte-offset used by unscaled into an "element" offset used
    // by the scaled pair load/store instructions.
    int ElemOffset = Offset / OffsetStride;
    if (ElemOffset > 63 || ElemOffset < -64)
      return false;
  }
  return true;
}
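
// Worked example (illustrative, not from the original source): the paired
// forms take a 7-bit signed, scaled immediate, i.e. element offsets in
// [-64, 63]. For an 8-byte unscaled access (OffsetStride 8), a byte offset
// of 504 gives ElemOffset 63 and is accepted, while 512 gives 64 and is
// rejected.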

// Do alignment, specialized to power of 2 and for signed ints,
// avoiding having to do a C-style cast from uint64_t to int when
// using RoundUpToAlignment from include/llvm/Support/MathExtras.h.
// FIXME: Move this function to include/MathExtras.h?
static int alignTo(int Num, int PowOf2) {
  return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
}
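
// Usage sketch (illustrative): alignTo(5, 8) == 8, alignTo(8, 8) == 8, and
// alignTo(9, 8) == 16. findMatchingInsn below uses this to require that an
// unscaled offset be expressible by the scaled pair instruction, i.e. that
// alignTo(MinOffset, OffsetStride) == MinOffset.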

/// findMatchingInsn - Scan the instructions looking for a load/store that can
/// be combined with the current instruction into a load/store pair.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
                                      bool &MergeForward, unsigned Limit) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator MBBI = I;
  MachineInstr *FirstMI = I;
  ++MBBI;

  int Opc = FirstMI->getOpcode();
  bool MayLoad = FirstMI->mayLoad();
  bool IsUnscaled = isUnscaledLdst(Opc);
  unsigned Reg = FirstMI->getOperand(0).getReg();
  unsigned BaseReg = FirstMI->getOperand(1).getReg();
  int Offset = FirstMI->getOperand(2).getImm();

  // Early exit if the first instruction modifies the base register.
  // e.g., ldr x0, [x0]
  // Early exit if the offset is not possible to match. (6 bits of positive
  // range, plus allow an extra one in case we find a later insn that matches
  // with Offset-1.)
  if (FirstMI->modifiesRegister(BaseReg, TRI))
    return E;
  int OffsetStride =
      IsUnscaled && EnableAArch64UnscaledMemOp ? getMemSize(FirstMI) : 1;
  if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
    return E;

  // Track which registers have been modified and used between the first insn
  // (inclusive) and the second insn.
  BitVector ModifiedRegs, UsedRegs;
  ModifiedRegs.resize(TRI->getNumRegs());
  UsedRegs.resize(TRI->getNumRegs());
  for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
    MachineInstr *MI = MBBI;
    // Skip DBG_VALUE instructions. Otherwise debug info can affect the
    // optimization by changing how far we scan.
    if (MI->isDebugValue())
      continue;

    // Now that we know this is a real instruction, count it.
    ++Count;

    if (Opc == MI->getOpcode() && MI->getOperand(2).isImm()) {
      // If we've found another instruction with the same opcode, check to see
      // if the base and offset are compatible with our starting instruction.
      // These instructions all have scaled immediate operands, so we just
      // check for +1/-1. Make sure to check the new instruction offset is
      // actually an immediate and not a symbolic reference destined for
      // a relocation.
      //
      // Pairwise instructions have a 7-bit signed offset field. Single insns
      // have a 12-bit unsigned offset field. To be a valid combine, the
      // final offset must be in range.
      unsigned MIBaseReg = MI->getOperand(1).getReg();
      int MIOffset = MI->getOperand(2).getImm();
      if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) ||
                                   (Offset + OffsetStride == MIOffset))) {
        int MinOffset = Offset < MIOffset ? Offset : MIOffset;
        // If this is a volatile load/store that otherwise matched, stop looking
        // as something is going on that we don't have enough information to
        // safely transform. Similarly, stop if we see a hint to avoid pairs.
        if (MI->hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
          return E;
        // If the resultant immediate offset of merging these instructions
        // is out of range for a pairwise instruction, bail and keep looking.
        bool MIIsUnscaled = isUnscaledLdst(MI->getOpcode());
        if (!inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) {
          trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
          continue;
        }
        // If the alignment requirements of the paired (scaled) instruction
        // can't express the offset of the unscaled input, bail and keep
        // looking.
        if (IsUnscaled && EnableAArch64UnscaledMemOp &&
            (alignTo(MinOffset, OffsetStride) != MinOffset)) {
          trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
          continue;
        }
        // If the destination register of the loads is the same register, bail
        // and keep looking. A load-pair instruction with both destination
        // registers the same is UNPREDICTABLE and will result in an exception.
        if (MayLoad && Reg == MI->getOperand(0).getReg()) {
          trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
          continue;
        }

        // If the Rt of the second instruction was not modified or used between
        // the two instructions, we can combine the second into the first.
        if (!ModifiedRegs[MI->getOperand(0).getReg()] &&
            !UsedRegs[MI->getOperand(0).getReg()]) {
          MergeForward = false;
          return MBBI;
        }

        // Likewise, if the Rt of the first instruction is not modified or used
        // between the two instructions, we can combine the first into the
        // second.
        if (!ModifiedRegs[FirstMI->getOperand(0).getReg()] &&
            !UsedRegs[FirstMI->getOperand(0).getReg()]) {
          MergeForward = true;
          return MBBI;
        }
        // Unable to combine these instructions due to interference in between.
        // Keep looking.
      }
    }

    // If the instruction wasn't a matching load or store, but does (or can)
    // modify memory, stop searching, as we don't have alias analysis or
    // anything like that to tell us whether the access is tromping on the
    // locations we care about. The big one we want to catch is calls.
    //
    // FIXME: Theoretically, we can do better than that for SP and FP based
    // references since we can effectively know where those are touching. It's
    // unclear if it's worth the extra code, though. Most paired instructions
    // will be sequential, perhaps with a few intervening non-memory related
    // instructions.
    if (MI->mayStore() || MI->isCall())
      return E;
    // Likewise, if we're matching a store instruction, we don't want to
    // move across a load, as it may be reading the same location.
    if (FirstMI->mayStore() && MI->mayLoad())
      return E;

    // Update modified / uses register lists.
    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);

    // Otherwise, if the base register is modified, we have no match, so
    // return early.
    if (ModifiedRegs[BaseReg])
      return E;
  }
  return E;
}
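
// Illustrative walk-through (added, not in the original source): starting
// from "ldr w0, [x3, #2]" (LDRWui, scaled offset 2 = byte offset 8), a scan
// that reaches "ldr w1, [x3, #3]" sees the same opcode and base register
// with offsets differing by OffsetStride (1), so, absent interference, the
// two become candidates for mergePairedInsns ("ldp w0, w1, [x3, #8]").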

MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergePreIdxUpdateInsn(MachineBasicBlock::iterator I,
                                           MachineBasicBlock::iterator Update) {
  assert((Update->getOpcode() == AArch64::ADDXri ||
          Update->getOpcode() == AArch64::SUBXri) &&
         "Unexpected base register update instruction to merge!");
  MachineBasicBlock::iterator NextI = I;
  // Return the instruction following the merged instruction, which is
  // the instruction following our unmerged load. Unless that's the add/sub
  // instruction we're merging, in which case it's the one after that.
  if (++NextI == Update)
    ++NextI;

  int Value = Update->getOperand(2).getImm();
  assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
         "Can't merge 1 << 12 offset into pre-indexed load / store");
  if (Update->getOpcode() == AArch64::SUBXri)
    Value = -Value;

  unsigned NewOpc = getPreIndexedOpcode(I->getOpcode());
  MachineInstrBuilder MIB =
      BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
          .addOperand(Update->getOperand(0))
          .addOperand(I->getOperand(0))
          .addOperand(I->getOperand(1))
          .addImm(Value);
  (void)MIB;

  DEBUG(dbgs() << "Creating pre-indexed load/store.");
  DEBUG(dbgs() << "    Replacing instructions:\n    ");
  DEBUG(I->print(dbgs()));
  DEBUG(dbgs() << "    ");
  DEBUG(Update->print(dbgs()));
  DEBUG(dbgs() << "  with instruction:\n    ");
  DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  DEBUG(dbgs() << "\n");

  // Erase the old instructions for the block.
  I->eraseFromParent();
  Update->eraseFromParent();

  return NextI;
}

MachineBasicBlock::iterator AArch64LoadStoreOpt::mergePostIdxUpdateInsn(
    MachineBasicBlock::iterator I, MachineBasicBlock::iterator Update) {
  assert((Update->getOpcode() == AArch64::ADDXri ||
          Update->getOpcode() == AArch64::SUBXri) &&
         "Unexpected base register update instruction to merge!");
  MachineBasicBlock::iterator NextI = I;
  // Return the instruction following the merged instruction, which is
  // the instruction following our unmerged load. Unless that's the add/sub
  // instruction we're merging, in which case it's the one after that.
  if (++NextI == Update)
    ++NextI;

  int Value = Update->getOperand(2).getImm();
  assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
         "Can't merge 1 << 12 offset into post-indexed load / store");
  if (Update->getOpcode() == AArch64::SUBXri)
    Value = -Value;

  unsigned NewOpc = getPostIndexedOpcode(I->getOpcode());
  MachineInstrBuilder MIB =
      BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
          .addOperand(Update->getOperand(0))
          .addOperand(I->getOperand(0))
          .addOperand(I->getOperand(1))
          .addImm(Value);
  (void)MIB;

  DEBUG(dbgs() << "Creating post-indexed load/store.");
  DEBUG(dbgs() << "    Replacing instructions:\n    ");
  DEBUG(I->print(dbgs()));
  DEBUG(dbgs() << "    ");
  DEBUG(Update->print(dbgs()));
  DEBUG(dbgs() << "  with instruction:\n    ");
  DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  DEBUG(dbgs() << "\n");

  // Erase the old instructions for the block.
  I->eraseFromParent();
  Update->eraseFromParent();

  return NextI;
}

static bool isMatchingUpdateInsn(MachineInstr *MI, unsigned BaseReg,
                                 int Offset) {
  switch (MI->getOpcode()) {
  default:
    break;
  case AArch64::SUBXri:
    // Negate the offset for a SUB instruction.
    Offset *= -1;
  // FALLTHROUGH
  case AArch64::ADDXri:
    // Make sure it's a vanilla immediate operand, not a relocation or
    // anything else we can't handle.
    if (!MI->getOperand(2).isImm())
      break;
    // Watch out for 1 << 12 shifted value.
    if (AArch64_AM::getShiftValue(MI->getOperand(3).getImm()))
      break;
    // If the instruction has the base register as source and dest and the
    // immediate will fit in a signed 9-bit integer, then we have a match.
    if (MI->getOperand(0).getReg() == BaseReg &&
        MI->getOperand(1).getReg() == BaseReg &&
        MI->getOperand(2).getImm() <= 255 &&
        MI->getOperand(2).getImm() >= -256) {
      // If we have a non-zero Offset, we check that it matches the amount
      // we're adding to the register.
      if (!Offset || Offset == MI->getOperand(2).getImm())
        return true;
    }
    break;
  }
  return false;
}
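
// Illustrative matches (added comment, not in the original source): with
// BaseReg == x2,
//   "add x2, x2, #8"          matches Offset 8 (or Offset 0, meaning "any"),
//   "sub x2, x2, #8"          matches Offset -8,
//   "add x2, x2, #1, lsl #12" never matches (shifted immediate),
//   "add x3, x2, #8"          never matches (dest is not the base register).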

MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
    MachineBasicBlock::iterator I, unsigned Limit, int Value) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr *MemMI = I;
  MachineBasicBlock::iterator MBBI = I;
  const MachineFunction &MF = *MemMI->getParent()->getParent();

  unsigned DestReg = MemMI->getOperand(0).getReg();
  unsigned BaseReg = MemMI->getOperand(1).getReg();
  int Offset = MemMI->getOperand(2).getImm() *
               TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize();

  // If the base register overlaps the destination register, we can't
  // merge the update.
  if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
    return E;

  // Scan forward looking for post-index opportunities.
  // Updating instructions can't be formed if the memory insn already
  // has an offset other than the value we're looking for.
  if (Offset != Value)
    return E;

  // Track which registers have been modified and used between the first insn
  // (inclusive) and the second insn.
  BitVector ModifiedRegs, UsedRegs;
  ModifiedRegs.resize(TRI->getNumRegs());
  UsedRegs.resize(TRI->getNumRegs());
  ++MBBI;
  for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
    MachineInstr *MI = MBBI;
    // Skip DBG_VALUE instructions. Otherwise debug info can affect the
    // optimization by changing how far we scan.
    if (MI->isDebugValue())
      continue;

    // Now that we know this is a real instruction, count it.
    ++Count;

    // If we found a match, return it.
    if (isMatchingUpdateInsn(MI, BaseReg, Value))
      return MBBI;

    // Update the status of what the instruction clobbered and used.
    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);

    // Otherwise, if the base register is used or modified, we have no match,
    // so return early.
    if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
      return E;
  }
  return E;
}

MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
    MachineBasicBlock::iterator I, unsigned Limit) {
  MachineBasicBlock::iterator B = I->getParent()->begin();
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr *MemMI = I;
  MachineBasicBlock::iterator MBBI = I;
  const MachineFunction &MF = *MemMI->getParent()->getParent();

  unsigned DestReg = MemMI->getOperand(0).getReg();
  unsigned BaseReg = MemMI->getOperand(1).getReg();
  int Offset = MemMI->getOperand(2).getImm();
  unsigned RegSize = TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize();

  // If the load/store is the first instruction in the block, there's obviously
  // not any matching update. Ditto if the memory offset isn't zero.
  if (MBBI == B || Offset != 0)
    return E;
  // If the base register overlaps the destination register, we can't
  // merge the update.
  if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
    return E;

  // Track which registers have been modified and used between the first insn
  // (inclusive) and the second insn.
  BitVector ModifiedRegs, UsedRegs;
  ModifiedRegs.resize(TRI->getNumRegs());
  UsedRegs.resize(TRI->getNumRegs());
  --MBBI;
  for (unsigned Count = 0; MBBI != B && Count < Limit; --MBBI) {
    MachineInstr *MI = MBBI;
    // Skip DBG_VALUE instructions. Otherwise debug info can affect the
    // optimization by changing how far we scan.
    if (MI->isDebugValue())
      continue;

    // Now that we know this is a real instruction, count it.
    ++Count;

    // If we found a match, return it.
    if (isMatchingUpdateInsn(MI, BaseReg, RegSize))
      return MBBI;

    // Update the status of what the instruction clobbered and used.
    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);

    // Otherwise, if the base register is used or modified, we have no match,
    // so return early.
    if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
      return E;
  }
  return E;
}

bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
  bool Modified = false;
  // Two transformations to do here:
  // 1) Find loads and stores that can be merged into a single load or store
  //    pair instruction.
  //      e.g.,
  //        ldr x0, [x2]
  //        ldr x1, [x2, #8]
  //        ; becomes
  //        ldp x0, x1, [x2]
  // 2) Find base register updates that can be merged into the load or store
  //    as a base-reg writeback.
  //      e.g.,
  //        ldr x0, [x2]
  //        add x2, x2, #4
  //        ; becomes
  //        ldr x0, [x2], #4

  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    MachineInstr *MI = MBBI;
    switch (MI->getOpcode()) {
    default:
      // Just move on to the next instruction.
      ++MBBI;
      break;
    case AArch64::STRSui:
    case AArch64::STRDui:
    case AArch64::STRQui:
    case AArch64::STRXui:
    case AArch64::STRWui:
    case AArch64::LDRSui:
    case AArch64::LDRDui:
    case AArch64::LDRQui:
    case AArch64::LDRXui:
    case AArch64::LDRWui:
    // do the unscaled versions as well
    case AArch64::STURSi:
    case AArch64::STURDi:
    case AArch64::STURQi:
    case AArch64::STURWi:
    case AArch64::STURXi:
    case AArch64::LDURSi:
    case AArch64::LDURDi:
    case AArch64::LDURQi:
    case AArch64::LDURWi:
    case AArch64::LDURXi: {
      // If this is a volatile load/store, don't mess with it.
      if (MI->hasOrderedMemoryRef()) {
        ++MBBI;
        break;
      }
      // Make sure this is a reg+imm (as opposed to an address reloc).
      if (!MI->getOperand(2).isImm()) {
        ++MBBI;
        break;
      }
      // Check if this load/store has a hint to avoid pair formation.
      // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
      if (TII->isLdStPairSuppressed(MI)) {
        ++MBBI;
        break;
      }
      // Look ahead up to ScanLimit instructions for a pairable instruction.
      bool MergeForward = false;
      MachineBasicBlock::iterator Paired =
          findMatchingInsn(MBBI, MergeForward, ScanLimit);
      if (Paired != E) {
        // Record whether this pair came from unscaled instructions before the
        // merge erases MI.
        bool IsUnscaled = isUnscaledLdst(MI->getOpcode());
        // Merge the loads into a pair. Keeping the iterator straight is a
        // pain, so we let the merge routine tell us what the next instruction
        // is after it's done mucking about.
        MBBI = mergePairedInsns(MBBI, Paired, MergeForward);

        Modified = true;
        ++NumPairCreated;
        if (IsUnscaled)
          ++NumUnscaledPairCreated;
        break;
      }
      ++MBBI;
      break;
    }
      // FIXME: Do the other instructions.
    }
  }

  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    MachineInstr *MI = MBBI;
    // Do update merging. It's simpler to keep this separate from the above
    // switch, though not strictly necessary.
    int Opc = MI->getOpcode();
    switch (Opc) {
    default:
      // Just move on to the next instruction.
      ++MBBI;
      break;
    case AArch64::STRSui:
    case AArch64::STRDui:
    case AArch64::STRQui:
    case AArch64::STRXui:
    case AArch64::STRWui:
    case AArch64::LDRSui:
    case AArch64::LDRDui:
    case AArch64::LDRQui:
    case AArch64::LDRXui:
    case AArch64::LDRWui:
    // do the unscaled versions as well
    case AArch64::STURSi:
    case AArch64::STURDi:
    case AArch64::STURQi:
    case AArch64::STURWi:
    case AArch64::STURXi:
    case AArch64::LDURSi:
    case AArch64::LDURDi:
    case AArch64::LDURQi:
    case AArch64::LDURWi:
    case AArch64::LDURXi: {
      // Make sure this is a reg+imm (as opposed to an address reloc).
      if (!MI->getOperand(2).isImm()) {
        ++MBBI;
        break;
      }
      // Look ahead up to ScanLimit instructions for a mergeable instruction.
      MachineBasicBlock::iterator Update =
          findMatchingUpdateInsnForward(MBBI, ScanLimit, 0);
      if (Update != E) {
        // Merge the update into the ld/st.
        MBBI = mergePostIdxUpdateInsn(MBBI, Update);
        Modified = true;
        ++NumPostFolded;
        break;
      }
      // Unscaled instructions don't have pre/post-indexed variants, so move
      // on to the next instruction.
      if (isUnscaledLdst(Opc)) {
        ++MBBI;
        break;
      }

      // Look back to try to find a pre-index instruction. For example,
      // add x0, x0, #8
      // ldr x1, [x0]
      //   merged into:
      // ldr x1, [x0, #8]!
      Update = findMatchingUpdateInsnBackward(MBBI, ScanLimit);
      if (Update != E) {
        // Merge the update into the ld/st.
        MBBI = mergePreIdxUpdateInsn(MBBI, Update);
        Modified = true;
        ++NumPreFolded;
        break;
      }

      // Look forward for an update instruction that matches the load/store's
      // existing offset, and merge it as a pre-index writeback. For example,
      // ldr x1, [x0, #64]
      // add x0, x0, #64
      //   merged into:
      // ldr x1, [x0, #64]!

      // The immediate in the load/store is scaled by the size of the register
      // being loaded. The immediate in the add we're looking for,
      // however, is not, so adjust here.
      int Value = MI->getOperand(2).getImm() *
                  TII->getRegClass(MI->getDesc(), 0, TRI, *(MBB.getParent()))
                      ->getSize();
      Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, Value);
      if (Update != E) {
        // Merge the update into the ld/st.
        MBBI = mergePreIdxUpdateInsn(MBBI, Update);
        Modified = true;
        ++NumPreFolded;
        break;
      }

      // Nothing found. Just move to the next instruction.
      ++MBBI;
      break;
    }
      // FIXME: Do the other instructions.
    }
  }

  return Modified;
}

bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
  const TargetMachine &TM = Fn.getTarget();
  TII = static_cast<const AArch64InstrInfo *>(
      TM.getSubtargetImpl()->getInstrInfo());
  TRI = TM.getSubtargetImpl()->getRegisterInfo();

  bool Modified = false;
  for (auto &MBB : Fn)
    Modified |= optimizeBlock(MBB);

  return Modified;
}

// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep
// loads and stores near one another?

/// createAArch64LoadStoreOptimizationPass - returns an instance of the
/// load / store optimization pass.
FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
  return new AArch64LoadStoreOpt();
}