LLVM API Documentation
00001 //===-- X86VZeroUpper.cpp - AVX vzeroupper instruction inserter -----------===// 00002 // 00003 // The LLVM Compiler Infrastructure 00004 // 00005 // This file is distributed under the University of Illinois Open Source 00006 // License. See LICENSE.TXT for details. 00007 // 00008 //===----------------------------------------------------------------------===// 00009 // 00010 // This file defines the pass which inserts x86 AVX vzeroupper instructions 00011 // before calls to SSE encoded functions. This avoids transition latency 00012 // penalty when tranfering control between AVX encoded instructions and old 00013 // SSE encoding mode. 00014 // 00015 //===----------------------------------------------------------------------===// 00016 00017 #include "X86.h" 00018 #include "X86InstrInfo.h" 00019 #include "X86Subtarget.h" 00020 #include "llvm/ADT/Statistic.h" 00021 #include "llvm/CodeGen/MachineFunctionPass.h" 00022 #include "llvm/CodeGen/MachineInstrBuilder.h" 00023 #include "llvm/CodeGen/MachineRegisterInfo.h" 00024 #include "llvm/CodeGen/Passes.h" 00025 #include "llvm/Support/Debug.h" 00026 #include "llvm/Support/raw_ostream.h" 00027 #include "llvm/Target/TargetInstrInfo.h" 00028 using namespace llvm; 00029 00030 #define DEBUG_TYPE "x86-vzeroupper" 00031 00032 STATISTIC(NumVZU, "Number of vzeroupper instructions inserted"); 00033 00034 namespace { 00035 00036 class VZeroUpperInserter : public MachineFunctionPass { 00037 public: 00038 00039 VZeroUpperInserter() : MachineFunctionPass(ID) {} 00040 bool runOnMachineFunction(MachineFunction &MF) override; 00041 const char *getPassName() const override {return "X86 vzeroupper inserter";} 00042 00043 private: 00044 00045 void processBasicBlock(MachineBasicBlock &MBB); 00046 void insertVZeroUpper(MachineBasicBlock::iterator I, 00047 MachineBasicBlock &MBB); 00048 void addDirtySuccessor(MachineBasicBlock &MBB); 00049 00050 typedef enum { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY } BlockExitState; 00051 static const char* getBlockExitStateName(BlockExitState ST); 00052 00053 // Core algorithm state: 00054 // BlockState - Each block is either: 00055 // - PASS_THROUGH: There are neither YMM dirtying instructions nor 00056 // vzeroupper instructions in this block. 00057 // - EXITS_CLEAN: There is (or will be) a vzeroupper instruction in this 00058 // block that will ensure that YMM is clean on exit. 00059 // - EXITS_DIRTY: An instruction in the block dirties YMM and no 00060 // subsequent vzeroupper in the block clears it. 00061 // 00062 // AddedToDirtySuccessors - This flag is raised when a block is added to the 00063 // DirtySuccessors list to ensure that it's not 00064 // added multiple times. 00065 // 00066 // FirstUnguardedCall - Records the location of the first unguarded call in 00067 // each basic block that may need to be guarded by a 00068 // vzeroupper. We won't know whether it actually needs 00069 // to be guarded until we discover a predecessor that 00070 // is DIRTY_OUT. 00071 struct BlockState { 00072 BlockState() : ExitState(PASS_THROUGH), AddedToDirtySuccessors(false) {} 00073 BlockExitState ExitState; 00074 bool AddedToDirtySuccessors; 00075 MachineBasicBlock::iterator FirstUnguardedCall; 00076 }; 00077 typedef SmallVector<BlockState, 8> BlockStateMap; 00078 typedef SmallVector<MachineBasicBlock*, 8> DirtySuccessorsWorkList; 00079 00080 BlockStateMap BlockStates; 00081 DirtySuccessorsWorkList DirtySuccessors; 00082 bool EverMadeChange; 00083 const TargetInstrInfo *TII; 00084 00085 static char ID; 00086 }; 00087 00088 char VZeroUpperInserter::ID = 0; 00089 } 00090 00091 FunctionPass *llvm::createX86IssueVZeroUpperPass() { 00092 return new VZeroUpperInserter(); 00093 } 00094 00095 const char* VZeroUpperInserter::getBlockExitStateName(BlockExitState ST) { 00096 switch (ST) { 00097 case PASS_THROUGH: return "Pass-through"; 00098 case EXITS_DIRTY: return "Exits-dirty"; 00099 case EXITS_CLEAN: return "Exits-clean"; 00100 } 00101 llvm_unreachable("Invalid block exit state."); 00102 } 00103 00104 static bool isYmmReg(unsigned Reg) { 00105 return (Reg >= X86::YMM0 && Reg <= X86::YMM15); 00106 } 00107 00108 static bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) { 00109 for (MachineRegisterInfo::livein_iterator I = MRI.livein_begin(), 00110 E = MRI.livein_end(); I != E; ++I) 00111 if (isYmmReg(I->first)) 00112 return true; 00113 00114 return false; 00115 } 00116 00117 static bool clobbersAllYmmRegs(const MachineOperand &MO) { 00118 for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) { 00119 if (!MO.clobbersPhysReg(reg)) 00120 return false; 00121 } 00122 return true; 00123 } 00124 00125 static bool hasYmmReg(MachineInstr *MI) { 00126 for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { 00127 const MachineOperand &MO = MI->getOperand(i); 00128 if (MI->isCall() && MO.isRegMask() && !clobbersAllYmmRegs(MO)) 00129 return true; 00130 if (!MO.isReg()) 00131 continue; 00132 if (MO.isDebug()) 00133 continue; 00134 if (isYmmReg(MO.getReg())) 00135 return true; 00136 } 00137 return false; 00138 } 00139 00140 /// clobbersAnyYmmReg() - Check if any YMM register will be clobbered by this 00141 /// instruction. 00142 static bool callClobbersAnyYmmReg(MachineInstr *MI) { 00143 assert(MI->isCall() && "Can only be called on call instructions."); 00144 for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { 00145 const MachineOperand &MO = MI->getOperand(i); 00146 if (!MO.isRegMask()) 00147 continue; 00148 for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) { 00149 if (MO.clobbersPhysReg(reg)) 00150 return true; 00151 } 00152 } 00153 return false; 00154 } 00155 00156 // Insert a vzeroupper instruction before I. 00157 void VZeroUpperInserter::insertVZeroUpper(MachineBasicBlock::iterator I, 00158 MachineBasicBlock &MBB) { 00159 DebugLoc dl = I->getDebugLoc(); 00160 BuildMI(MBB, I, dl, TII->get(X86::VZEROUPPER)); 00161 ++NumVZU; 00162 EverMadeChange = true; 00163 } 00164 00165 // Add MBB to the DirtySuccessors list if it hasn't already been added. 00166 void VZeroUpperInserter::addDirtySuccessor(MachineBasicBlock &MBB) { 00167 if (!BlockStates[MBB.getNumber()].AddedToDirtySuccessors) { 00168 DirtySuccessors.push_back(&MBB); 00169 BlockStates[MBB.getNumber()].AddedToDirtySuccessors = true; 00170 } 00171 } 00172 00173 /// processBasicBlock - Loop over all of the instructions in the basic block, 00174 /// inserting vzero upper instructions before function calls. 00175 void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) { 00176 00177 // Start by assuming that the block PASS_THROUGH, which implies no unguarded 00178 // calls. 00179 BlockExitState CurState = PASS_THROUGH; 00180 BlockStates[MBB.getNumber()].FirstUnguardedCall = MBB.end(); 00181 00182 for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) { 00183 MachineInstr *MI = I; 00184 bool isControlFlow = MI->isCall() || MI->isReturn(); 00185 00186 // Shortcut: don't need to check regular instructions in dirty state. 00187 if (!isControlFlow && CurState == EXITS_DIRTY) 00188 continue; 00189 00190 if (hasYmmReg(MI)) { 00191 // We found a ymm-using instruction; this could be an AVX instruction, 00192 // or it could be control flow. 00193 CurState = EXITS_DIRTY; 00194 continue; 00195 } 00196 00197 // Check for control-flow out of the current function (which might 00198 // indirectly execute SSE instructions). 00199 if (!isControlFlow) 00200 continue; 00201 00202 // If the call won't clobber any YMM register, skip it as well. It usually 00203 // happens on helper function calls (such as '_chkstk', '_ftol2') where 00204 // standard calling convention is not used (RegMask is not used to mark 00205 // register clobbered and register usage (def/imp-def/use) is well-dfined 00206 // and explicitly specified. 00207 if (MI->isCall() && !callClobbersAnyYmmReg(MI)) 00208 continue; 00209 00210 // The VZEROUPPER instruction resets the upper 128 bits of all Intel AVX 00211 // registers. This instruction has zero latency. In addition, the processor 00212 // changes back to Clean state, after which execution of Intel SSE 00213 // instructions or Intel AVX instructions has no transition penalty. Add 00214 // the VZEROUPPER instruction before any function call/return that might 00215 // execute SSE code. 00216 // FIXME: In some cases, we may want to move the VZEROUPPER into a 00217 // predecessor block. 00218 if (CurState == EXITS_DIRTY) { 00219 // After the inserted VZEROUPPER the state becomes clean again, but 00220 // other YMM may appear before other subsequent calls or even before 00221 // the end of the BB. 00222 insertVZeroUpper(I, MBB); 00223 CurState = EXITS_CLEAN; 00224 } else if (CurState == PASS_THROUGH) { 00225 // If this block is currently in pass-through state and we encounter a 00226 // call then whether we need a vzeroupper or not depends on whether this 00227 // block has successors that exit dirty. Record the location of the call, 00228 // and set the state to EXITS_CLEAN, but do not insert the vzeroupper yet. 00229 // It will be inserted later if necessary. 00230 BlockStates[MBB.getNumber()].FirstUnguardedCall = I; 00231 CurState = EXITS_CLEAN; 00232 } 00233 } 00234 00235 DEBUG(dbgs() << "MBB #" << MBB.getNumber() << " exit state: " 00236 << getBlockExitStateName(CurState) << '\n'); 00237 00238 if (CurState == EXITS_DIRTY) 00239 for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(), 00240 SE = MBB.succ_end(); 00241 SI != SE; ++SI) 00242 addDirtySuccessor(**SI); 00243 00244 BlockStates[MBB.getNumber()].ExitState = CurState; 00245 } 00246 00247 /// runOnMachineFunction - Loop over all of the basic blocks, inserting 00248 /// vzero upper instructions before function calls. 00249 bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) { 00250 const X86Subtarget &ST = MF.getTarget().getSubtarget<X86Subtarget>(); 00251 if (!ST.hasAVX() || ST.hasAVX512()) 00252 return false; 00253 TII = MF.getSubtarget().getInstrInfo(); 00254 MachineRegisterInfo &MRI = MF.getRegInfo(); 00255 EverMadeChange = false; 00256 00257 // Fast check: if the function doesn't use any ymm registers, we don't need 00258 // to insert any VZEROUPPER instructions. This is constant-time, so it is 00259 // cheap in the common case of no ymm use. 00260 bool YMMUsed = false; 00261 const TargetRegisterClass *RC = &X86::VR256RegClass; 00262 for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); 00263 i != e; i++) { 00264 if (!MRI.reg_nodbg_empty(*i)) { 00265 YMMUsed = true; 00266 break; 00267 } 00268 } 00269 if (!YMMUsed) { 00270 return false; 00271 } 00272 00273 assert(BlockStates.empty() && DirtySuccessors.empty() && 00274 "X86VZeroUpper state should be clear"); 00275 BlockStates.resize(MF.getNumBlockIDs()); 00276 00277 // Process all blocks. This will compute block exit states, record the first 00278 // unguarded call in each block, and add successors of dirty blocks to the 00279 // DirtySuccessors list. 00280 for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) 00281 processBasicBlock(*I); 00282 00283 // If any YMM regs are live in to this function, add the entry block to the 00284 // DirtySuccessors list 00285 if (checkFnHasLiveInYmm(MRI)) 00286 addDirtySuccessor(MF.front()); 00287 00288 // Re-visit all blocks that are successors of EXITS_DIRTY bsocks. Add 00289 // vzeroupper instructions to unguarded calls, and propagate EXITS_DIRTY 00290 // through PASS_THROUGH blocks. 00291 while (!DirtySuccessors.empty()) { 00292 MachineBasicBlock &MBB = *DirtySuccessors.back(); 00293 DirtySuccessors.pop_back(); 00294 BlockState &BBState = BlockStates[MBB.getNumber()]; 00295 00296 // MBB is a successor of a dirty block, so its first call needs to be 00297 // guarded. 00298 if (BBState.FirstUnguardedCall != MBB.end()) 00299 insertVZeroUpper(BBState.FirstUnguardedCall, MBB); 00300 00301 // If this successor was a pass-through block then it is now dirty, and its 00302 // successors need to be added to the worklist (if they haven't been 00303 // already). 00304 if (BBState.ExitState == PASS_THROUGH) { 00305 DEBUG(dbgs() << "MBB #" << MBB.getNumber() 00306 << " was Pass-through, is now Dirty-out.\n"); 00307 for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(), 00308 SE = MBB.succ_end(); 00309 SI != SE; ++SI) 00310 addDirtySuccessor(**SI); 00311 } 00312 } 00313 00314 BlockStates.clear(); 00315 return EverMadeChange; 00316 }