LLVM API Documentation

R600ExpandSpecialInstrs.cpp
//===-- R600ExpandSpecialInstrs.cpp - Expand special instructions ---------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Vector, Reduction, and Cube instructions need to fill the entire instruction
/// group to work correctly.  This pass expands these individual instructions
/// into several instructions that will completely fill the instruction group.
//
//===----------------------------------------------------------------------===//
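//
// Background: on the R600 family an ALU instruction group is a VLIW bundle
// with one slot per channel (X, Y, Z, W) plus a transcendental slot, which is
// why the pseudo instructions handled here are expanded into one real
// instruction per slot.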

#include "AMDGPU.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

using namespace llvm;

namespace {

class R600ExpandSpecialInstrsPass : public MachineFunctionPass {

private:
  static char ID;
  const R600InstrInfo *TII;

  void SetFlagInNewMI(MachineInstr *NewMI, const MachineInstr *OldMI,
      unsigned Op);

public:
  R600ExpandSpecialInstrsPass(TargetMachine &tm) : MachineFunctionPass(ID),
    TII(nullptr) { }

  bool runOnMachineFunction(MachineFunction &MF) override;

  const char *getPassName() const override {
    return "R600 Expand special instructions pass";
  }
};

} // End anonymous namespace

char R600ExpandSpecialInstrsPass::ID = 0;

FunctionPass *llvm::createR600ExpandSpecialInstrsPass(TargetMachine &TM) {
  return new R600ExpandSpecialInstrsPass(TM);
}

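// Copy the immediate operand named by Op (e.g. clamp, literal, or a source
// modifier flag) from OldMI to NewMI, if OldMI has such an operand.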
void R600ExpandSpecialInstrsPass::SetFlagInNewMI(MachineInstr *NewMI,
    const MachineInstr *OldMI, unsigned Op) {
  int OpIdx = TII->getOperandIdx(*OldMI, Op);
  if (OpIdx > -1) {
    uint64_t Val = OldMI->getOperand(OpIdx).getImm();
    TII->setImmOperand(NewMI, Op, Val);
  }
}

bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
  TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo());

  const R600RegisterInfo &TRI = TII->getRegisterInfo();

  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
                                                  BB != BB_E; ++BB) {
    MachineBasicBlock &MBB = *BB;
    MachineBasicBlock::iterator I = MBB.begin();
    while (I != MBB.end()) {
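      // Grab the current instruction and advance the iterator up front: the
      // cases below may erase MI, and any replacement instructions are
      // inserted before I (i.e. immediately after the original MI).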
      MachineInstr &MI = *I;
      I = std::next(I);

      // Expand LDS_*_RET instructions
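      //
      // An LDS read returns its result through the OQAP register (the LDS
      // output queue), so the instruction is rewritten to define OQAP and a
      // MOV is appended to copy OQAP into the originally requested
      // destination register.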
      if (TII->isLDSRetInstr(MI.getOpcode())) {
        int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
        assert(DstIdx != -1);
        MachineOperand &DstOp = MI.getOperand(DstIdx);
        MachineInstr *Mov = TII->buildMovInstr(&MBB, I,
                                               DstOp.getReg(), AMDGPU::OQAP);
        DstOp.setReg(AMDGPU::OQAP);
        int LDSPredSelIdx = TII->getOperandIdx(MI.getOpcode(),
                                           AMDGPU::OpName::pred_sel);
        int MovPredSelIdx = TII->getOperandIdx(Mov->getOpcode(),
                                           AMDGPU::OpName::pred_sel);
        // Copy the pred_sel bit
        Mov->getOperand(MovPredSelIdx).setReg(
            MI.getOperand(LDSPredSelIdx).getReg());
      }

      switch (MI.getOpcode()) {
      default: break;
      // Expand PRED_X to one of the PRED_SET instructions.
      case AMDGPU::PRED_X: {
        uint64_t Flags = MI.getOperand(3).getImm();
        // The native opcode used by PRED_X is stored as an immediate in the
        // third operand.
        MachineInstr *PredSet = TII->buildDefaultInstruction(MBB, I,
                                            MI.getOperand(2).getImm(), // opcode
                                            MI.getOperand(0).getReg(), // dst
                                            MI.getOperand(1).getReg(), // src0
                                            AMDGPU::ZERO);             // src1
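        // Write-mask the PRED_SET result: only the predicate/exec-mask side
        // effect selected below is needed, not the destination register value.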
        TII->addFlag(PredSet, 0, MO_FLAG_MASK);
        if (Flags & MO_FLAG_PUSH) {
          TII->setImmOperand(PredSet, AMDGPU::OpName::update_exec_mask, 1);
        } else {
          TII->setImmOperand(PredSet, AMDGPU::OpName::update_pred, 1);
        }
        MI.eraseFromParent();
        continue;
        }

      case AMDGPU::INTERP_PAIR_XY: {
        MachineInstr *BMI;
        unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister(
                MI.getOperand(2).getImm());

        for (unsigned Chan = 0; Chan < 4; ++Chan) {
          unsigned DstReg;

          if (Chan < 2)
            DstReg = MI.getOperand(Chan).getReg();
          else
            DstReg = Chan == 2 ? AMDGPU::T0_Z : AMDGPU::T0_W;

          BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_XY,
              DstReg, MI.getOperand(3 + (Chan % 2)).getReg(), PReg);

          if (Chan > 0) {
            BMI->bundleWithPred();
          }
          if (Chan >= 2)
            TII->addFlag(BMI, 0, MO_FLAG_MASK);
          if (Chan != 3)
            TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST);
        }

        MI.eraseFromParent();
        continue;
        }

      case AMDGPU::INTERP_PAIR_ZW: {
        MachineInstr *BMI;
        unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister(
                MI.getOperand(2).getImm());

        for (unsigned Chan = 0; Chan < 4; ++Chan) {
          unsigned DstReg;

          if (Chan < 2)
            DstReg = Chan == 0 ? AMDGPU::T0_X : AMDGPU::T0_Y;
          else
            DstReg = MI.getOperand(Chan-2).getReg();

          BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_ZW,
              DstReg, MI.getOperand(3 + (Chan % 2)).getReg(), PReg);

          if (Chan > 0) {
            BMI->bundleWithPred();
          }
          if (Chan < 2)
            TII->addFlag(BMI, 0, MO_FLAG_MASK);
          if (Chan != 3)
            TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST);
        }

        MI.eraseFromParent();
        continue;
        }

      case AMDGPU::INTERP_VEC_LOAD: {
        const R600RegisterInfo &TRI = TII->getRegisterInfo();
        MachineInstr *BMI;
        unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister(
                MI.getOperand(1).getImm());
        unsigned DstReg = MI.getOperand(0).getReg();

        for (unsigned Chan = 0; Chan < 4; ++Chan) {
          BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_LOAD_P0,
              TRI.getSubReg(DstReg, TRI.getSubRegFromChannel(Chan)), PReg);
          if (Chan > 0) {
            BMI->bundleWithPred();
          }
          if (Chan != 3)
            TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST);
        }

        MI.eraseFromParent();
        continue;
        }
      case AMDGPU::DOT_4: {
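        // DOT_4 is a pseudo vector op: emit one dot-product slot per channel,
        // bundle the four slots together, and write-mask every channel except
        // the one the original destination register actually names.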

        const R600RegisterInfo &TRI = TII->getRegisterInfo();

        unsigned DstReg = MI.getOperand(0).getReg();
        unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK;

        for (unsigned Chan = 0; Chan < 4; ++Chan) {
          bool Mask = (Chan != TRI.getHWRegChan(DstReg));
          unsigned SubDstReg =
              AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan);
          MachineInstr *BMI =
              TII->buildSlotOfVectorInstruction(MBB, &MI, Chan, SubDstReg);
          if (Chan > 0) {
            BMI->bundleWithPred();
          }
          if (Mask) {
            TII->addFlag(BMI, 0, MO_FLAG_MASK);
          }
          if (Chan != 3)
            TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST);
          unsigned Opcode = BMI->getOpcode();
          // While not strictly necessary from the hardware's point of view, we
          // force all src operands of a dot4 instruction to belong to the same
          // slot.
          unsigned Src0 = BMI->getOperand(
              TII->getOperandIdx(Opcode, AMDGPU::OpName::src0))
              .getReg();
          unsigned Src1 = BMI->getOperand(
              TII->getOperandIdx(Opcode, AMDGPU::OpName::src1))
              .getReg();
          (void) Src0;
          (void) Src1;
          if ((TRI.getEncodingValue(Src0) & 0xff) < 127 &&
              (TRI.getEncodingValue(Src1) & 0xff) < 127)
            assert(TRI.getHWRegChan(Src0) == TRI.getHWRegChan(Src1));
        }
        MI.eraseFromParent();
        continue;
      }
      }

      bool IsReduction = TII->isReductionOp(MI.getOpcode());
      bool IsVector = TII->isVector(MI);
      bool IsCube = TII->isCubeOp(MI.getOpcode());
      if (!IsReduction && !IsVector && !IsCube) {
        continue;
      }

      // Expand the instruction
      //
      // Reduction instructions:
      // T0_X = DP4 T1_XYZW, T2_XYZW
      // becomes:
      // T0_X = DP4 T1_X, T2_X
      // T0_Y (write masked) = DP4 T1_Y, T2_Y
      // T0_Z (write masked) = DP4 T1_Z, T2_Z
      // T0_W (write masked) = DP4 T1_W, T2_W
      //
      // Vector instructions:
      // T0_X = MULLO_INT T1_X, T2_X
      // becomes:
      // T0_X = MULLO_INT T1_X, T2_X
      // T0_Y (write masked) = MULLO_INT T1_X, T2_X
      // T0_Z (write masked) = MULLO_INT T1_X, T2_X
      // T0_W (write masked) = MULLO_INT T1_X, T2_X
      //
      // Cube instructions:
      // T0_XYZW = CUBE T1_XYZW
      // becomes:
      // T0_X = CUBE T1_Z, T1_Y
      // T0_Y = CUBE T1_Z, T1_X
      // T0_Z = CUBE T1_X, T1_Z
      // T0_W = CUBE T1_Y, T1_Z
      for (unsigned Chan = 0; Chan < 4; Chan++) {
        unsigned DstReg = MI.getOperand(
                            TII->getOperandIdx(MI, AMDGPU::OpName::dst)).getReg();
        unsigned Src0 = MI.getOperand(
                           TII->getOperandIdx(MI, AMDGPU::OpName::src0)).getReg();
        unsigned Src1 = 0;

        // Determine the correct source registers
        if (!IsCube) {
          int Src1Idx = TII->getOperandIdx(MI, AMDGPU::OpName::src1);
          if (Src1Idx != -1) {
            Src1 = MI.getOperand(Src1Idx).getReg();
          }
        }
        if (IsReduction) {
          unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan);
          Src0 = TRI.getSubReg(Src0, SubRegIndex);
          Src1 = TRI.getSubReg(Src1, SubRegIndex);
        } else if (IsCube) {
          static const int CubeSrcSwz[] = {2, 2, 0, 1};
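          // Indexing with Chan selects src0's channel and (3 - Chan) selects
          // src1's, which reproduces the CUBE expansion shown above:
          // X = CUBE(Z, Y), Y = CUBE(Z, X), Z = CUBE(X, Z), W = CUBE(Y, Z).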
          unsigned SubRegIndex0 = TRI.getSubRegFromChannel(CubeSrcSwz[Chan]);
          unsigned SubRegIndex1 = TRI.getSubRegFromChannel(CubeSrcSwz[3 - Chan]);
          Src1 = TRI.getSubReg(Src0, SubRegIndex1);
          Src0 = TRI.getSubReg(Src0, SubRegIndex0);
        }

        // Determine the correct destination register.
        bool Mask = false;
        bool NotLast = true;
        if (IsCube) {
          unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan);
          DstReg = TRI.getSubReg(DstReg, SubRegIndex);
        } else {
          // Mask the write if the original instruction does not write to
          // the current channel.
          Mask = (Chan != TRI.getHWRegChan(DstReg));
          unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK;
          DstReg = AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan);
        }

        // Mark every channel except the last one as "not last".
        NotLast = (Chan != 3);

        // Add the new instruction
        unsigned Opcode = MI.getOpcode();
        switch (Opcode) {
        case AMDGPU::CUBE_r600_pseudo:
          Opcode = AMDGPU::CUBE_r600_real;
          break;
        case AMDGPU::CUBE_eg_pseudo:
          Opcode = AMDGPU::CUBE_eg_real;
          break;
        default:
          break;
        }

        MachineInstr *NewMI =
          TII->buildDefaultInstruction(MBB, I, Opcode, DstReg, Src0, Src1);

        if (Chan != 0)
          NewMI->bundleWithPred();
        if (Mask) {
          TII->addFlag(NewMI, 0, MO_FLAG_MASK);
        }
        if (NotLast) {
          TII->addFlag(NewMI, 0, MO_FLAG_NOT_LAST);
        }
        SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::clamp);
        SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::literal);
        SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src0_abs);
        SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src1_abs);
        SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src0_neg);
        SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src1_neg);
      }
      MI.eraseFromParent();
    }
  }
  return false;
}