Diffstat (limited to 'lib/Target/R600')
25 files changed, 454 insertions, 308 deletions
diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h
index fcf9eca80e96f..c6600550126eb 100644
--- a/lib/Target/R600/AMDGPU.h
+++ b/lib/Target/R600/AMDGPU.h
@@ -77,7 +77,11 @@ extern Target TheGCNTarget;
 
 namespace AMDGPU {
 enum TargetIndex {
-  TI_CONSTDATA_START
+  TI_CONSTDATA_START,
+  TI_SCRATCH_RSRC_DWORD0,
+  TI_SCRATCH_RSRC_DWORD1,
+  TI_SCRATCH_RSRC_DWORD2,
+  TI_SCRATCH_RSRC_DWORD3
 };
 }
diff --git a/lib/Target/R600/AMDGPU.td b/lib/Target/R600/AMDGPU.td
index 8a5ca613dc800..1df4448abf05e 100644
--- a/lib/Target/R600/AMDGPU.td
+++ b/lib/Target/R600/AMDGPU.td
@@ -92,6 +92,11 @@ def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space",
         "true",
         "Support flat address space">;
 
+def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling",
+        "EnableVGPRSpilling",
+        "true",
+        "Enable spilling of VGPRs to scratch memory">;
+
 class SubtargetFeatureFetchLimit <string Value> :
                           SubtargetFeature <"fetch"#Value,
         "TexVTXClauseSize",
diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp
index 624f3919b409c..6185e367ff50c 100644
--- a/lib/Target/R600/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp
@@ -116,7 +116,6 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
   const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
   SIProgramInfo KernelInfo;
   if (STM.isAmdHsaOS()) {
-    OutStreamer.SwitchSection(getObjFileLowering().getTextSection());
     getSIProgramInfo(KernelInfo, MF);
     EmitAmdKernelCodeT(MF, KernelInfo);
     OutStreamer.EmitCodeAlignment(2 << (MF.getAlignment() - 1));
@@ -421,6 +420,7 @@ static unsigned getRsrcReg(unsigned ShaderType) {
 
 void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
                                          const SIProgramInfo &KernelInfo) {
+  const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   unsigned RsrcReg = getRsrcReg(MFI->getShaderType());
 
@@ -441,6 +441,10 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
     OutStreamer.EmitIntValue(RsrcReg, 4);
     OutStreamer.EmitIntValue(S_00B028_VGPRS(KernelInfo.VGPRBlocks) |
                              S_00B028_SGPRS(KernelInfo.SGPRBlocks), 4);
+    if (STM.isVGPRSpillingEnabled(MFI)) {
+      OutStreamer.EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);
+      OutStreamer.EmitIntValue(S_0286E8_WAVESIZE(KernelInfo.ScratchBlocks), 4);
+    }
   }
 
   if (MFI->getShaderType() == ShaderType::PIXEL) {
@@ -504,6 +508,19 @@ void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
 
   header.wavefront_size = STM.getWavefrontSize();
 
+  const MCSectionELF *VersionSection = OutContext.getELFSection(".hsa.version",
+      ELF::SHT_PROGBITS, 0, SectionKind::getReadOnly());
+  OutStreamer.SwitchSection(VersionSection);
+  OutStreamer.EmitBytes(Twine("HSA Code Unit:" +
+                        Twine(header.hsail_version_major) + "." +
+                        Twine(header.hsail_version_minor) + ":" +
+                        "AMD:" +
+                        Twine(header.amd_code_version_major) + "." +
+                        Twine(header.amd_code_version_minor) + ":" +
+                        "GFX8.1:0").str());
+
+  OutStreamer.SwitchSection(getObjFileLowering().getTextSection());
+
   if (isVerbose()) {
     OutStreamer.emitRawComment("amd_code_version_major = " +
                                Twine(header.amd_code_version_major), false);
diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
index eaa506db96c3c..15112c7e54d4e 100644
--- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
@@ -417,6 +417,28 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
                                   N->getValueType(0), Ops);
   }
 
+  case ISD::LOAD: {
+    // To simplify the TableGen patterns, we replace all i64 loads with
+    // v2i32 loads.  Alternatively, we could promote i64 loads to v2i32
+    // during DAG legalization; however, some places (ExpandUnalignedLoad)
+    // in the DAG legalizer assume i64 loads are legal whenever i64 is a
+    // legal type, so doing this promotion early can cause problems.
+    EVT VT = N->getValueType(0);
+    LoadSDNode *LD = cast<LoadSDNode>(N);
+    if (VT != MVT::i64 || LD->getExtensionType() != ISD::NON_EXTLOAD)
+      break;
+
+    SDValue NewLoad = CurDAG->getLoad(MVT::v2i32, SDLoc(N), LD->getChain(),
+                                      LD->getBasePtr(), LD->getMemOperand());
+    SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SDLoc(N),
+                                      MVT::i64, NewLoad);
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLoad.getValue(1));
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), BitCast);
+    SelectCode(NewLoad.getNode());
+    N = BitCast.getNode();
+    break;
+  }
+
   case AMDGPUISD::REGISTER_LOAD: {
     if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
       break;
@@ -962,16 +984,27 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
     const SITargetLowering& Lowering =
       *static_cast<const SITargetLowering*>(getTargetLowering());
 
-    unsigned ScratchPtrReg =
-        TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR);
     unsigned ScratchOffsetReg =
         TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET);
     Lowering.CreateLiveInRegister(*CurDAG, &AMDGPU::SReg_32RegClass,
                                   ScratchOffsetReg, MVT::i32);
+    SDValue Sym0 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD0", MVT::i32);
+    SDValue ScratchRsrcDword0 =
+        SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Sym0), 0);
 
-    SDValue ScratchPtr =
-      CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
-                             MRI.getLiveInVirtReg(ScratchPtrReg), MVT::i64);
+    SDValue Sym1 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD1", MVT::i32);
+    SDValue ScratchRsrcDword1 =
+        SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Sym1), 0);
+
+    const SDValue RsrcOps[] = {
+        CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32),
+        ScratchRsrcDword0,
+        CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32),
+        ScratchRsrcDword1,
+        CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32),
+    };
+    SDValue ScratchPtr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
+                                                        MVT::v2i32, RsrcOps), 0);
     Rsrc = SDValue(Lowering.buildScratchRSRC(*CurDAG, DL, ScratchPtr), 0);
     SOffset = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
         MRI.getLiveInVirtReg(ScratchOffsetReg), MVT::i32);
@@ -988,22 +1021,6 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
     }
   }
 
-  // (add FI, n0)
-  if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
-      isa<FrameIndexSDNode>(Addr.getOperand(0))) {
-    VAddr = Addr.getOperand(1);
-    ImmOffset = Addr.getOperand(0);
-    return true;
-  }
-
-  // (FI)
-  if (isa<FrameIndexSDNode>(Addr)) {
-    VAddr = SDValue(CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
-                                           CurDAG->getConstant(0, MVT::i32)), 0);
-    ImmOffset = Addr;
-    return true;
-  }
-
   // (node)
   VAddr = Addr;
   ImmOffset = CurDAG->getTargetConstant(0, MVT::i16);
diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index 206050d54a02a..2adcdf1c299e8 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -187,9 +187,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
   AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
 
-  setOperationAction(ISD::LOAD, MVT::i64, Promote);
-  AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
-
   setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
   AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
diff --git a/lib/Target/R600/AMDGPUInstrInfo.cpp b/lib/Target/R600/AMDGPUInstrInfo.cpp
index 5beaa6841c946..e34a7b7345f12 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.cpp
+++ b/lib/Target/R600/AMDGPUInstrInfo.cpp
@@ -341,8 +341,39 @@ int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const {
 // instead.
 namespace llvm {
 namespace AMDGPU {
-int getMCOpcode(uint16_t Opcode, unsigned Gen) {
+static int getMCOpcode(uint16_t Opcode, unsigned Gen) {
   return getMCOpcodeGen(Opcode, (enum Subtarget)Gen);
 }
 }
 }
+
+// This must be kept in sync with the SISubtarget class in SIInstrInfo.td
+enum SISubtarget {
+  SI = 0,
+  VI = 1
+};
+
+enum SISubtarget AMDGPUSubtargetToSISubtarget(unsigned Gen) {
+  switch (Gen) {
+  default:
+    return SI;
+  case AMDGPUSubtarget::VOLCANIC_ISLANDS:
+    return VI;
+  }
+}
+
+int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const {
+  int MCOp = AMDGPU::getMCOpcode(Opcode,
+                        AMDGPUSubtargetToSISubtarget(RI.ST.getGeneration()));
+
+  // -1 means that Opcode is already a native instruction.
+  if (MCOp == -1)
+    return Opcode;
+
+  // (uint16_t)-1 means that Opcode is a pseudo instruction that has
+  // no encoding in the given subtarget generation.
+  if (MCOp == (uint16_t)-1)
+    return -1;
+
+  return MCOp;
+}
diff --git a/lib/Target/R600/AMDGPUInstrInfo.h b/lib/Target/R600/AMDGPUInstrInfo.h
index da9833d25a52e..e28ce0f03acc6 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.h
+++ b/lib/Target/R600/AMDGPUInstrInfo.h
@@ -135,6 +135,11 @@ public:
   bool isRegisterStore(const MachineInstr &MI) const;
   bool isRegisterLoad(const MachineInstr &MI) const;
 
+  /// \brief Return a target-specific opcode if Opcode is a pseudo instruction.
+  /// Return -1 if the target-specific opcode for the pseudo instruction does
+  /// not exist. If Opcode is not a pseudo instruction, this is identity.
+  int pseudoToMCOpcode(int Opcode) const;
+
 //===---------------------------------------------------------------------===//
 // Pure virtual functions to be implemented by sub-classes.
 //===---------------------------------------------------------------------===//
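The pseudoToMCOpcode contract above uses two different sentinels, which is easy to misread: the generated table returns -1 for opcodes that are already native, and 0xffff for pseudos that have no encoding on the current generation. A minimal standalone sketch of the same sentinel handling, with a toy opcode table standing in for the TableGen-generated getMCOpcodeGen:

    #include <cstdint>

    // Toy stand-in for the generated table: -1 = already native,
    // 0xffff = pseudo with no encoding on this generation.
    static int getMCOpcodeToy(uint16_t Opcode, unsigned Gen) {
      if (Opcode == 100)               // hypothetical pseudo, SI-only
        return Gen == 0 ? 200 : 0xffff;
      return -1;                       // everything else is native
    }

    // Same sentinel handling as AMDGPUInstrInfo::pseudoToMCOpcode.
    static int pseudoToMCOpcodeToy(int Opcode, unsigned Gen) {
      int MCOp = getMCOpcodeToy(Opcode, Gen);
      if (MCOp == -1)
        return Opcode;                 // native: identity
      if (MCOp == (uint16_t)-1)
        return -1;                     // pseudo without a real encoding
      return MCOp;                     // pseudo lowered to its MC opcode
    }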
diff --git a/lib/Target/R600/AMDGPUMCInstLower.cpp b/lib/Target/R600/AMDGPUMCInstLower.cpp
index 1995ef2b0c9ea..03aa32d05ef0b 100644
--- a/lib/Target/R600/AMDGPUMCInstLower.cpp
+++ b/lib/Target/R600/AMDGPUMCInstLower.cpp
@@ -22,6 +22,7 @@
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCContext.h"
@@ -39,29 +40,17 @@ AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &st):
   Ctx(ctx), ST(st)
 { }
 
-enum AMDGPUMCInstLower::SISubtarget
-AMDGPUMCInstLower::AMDGPUSubtargetToSISubtarget(unsigned Gen) const {
-  switch (Gen) {
-  default:
-    return AMDGPUMCInstLower::SI;
-  case AMDGPUSubtarget::VOLCANIC_ISLANDS:
-    return AMDGPUMCInstLower::VI;
-  }
-}
-
-unsigned AMDGPUMCInstLower::getMCOpcode(unsigned MIOpcode) const {
-
-  int MCOpcode = AMDGPU::getMCOpcode(MIOpcode,
-                        AMDGPUSubtargetToSISubtarget(ST.getGeneration()));
-  if (MCOpcode == -1)
-    MCOpcode = MIOpcode;
+void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
 
-  return MCOpcode;
-}
+  int MCOpcode = ST.getInstrInfo()->pseudoToMCOpcode(MI->getOpcode());
 
-void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
+  if (MCOpcode == -1) {
+    LLVMContext &C = MI->getParent()->getParent()->getFunction()->getContext();
+    C.emitError("AMDGPUMCInstLower::lower - Pseudo instruction doesn't have "
+                "a target-specific version: " + Twine(MI->getOpcode()));
+  }
 
-  OutMI.setOpcode(getMCOpcode(MI->getOpcode()));
+  OutMI.setOpcode(MCOpcode);
 
   for (const MachineOperand &MO : MI->explicit_operands()) {
     MCOperand MCOp;
@@ -91,6 +80,12 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
       MCOp = MCOperand::CreateExpr(Expr);
       break;
     }
+    case MachineOperand::MO_ExternalSymbol: {
+      MCSymbol *Sym = Ctx.GetOrCreateSymbol(StringRef(MO.getSymbolName()));
+      const MCSymbolRefExpr *Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+      MCOp = MCOperand::CreateExpr(Expr);
+      break;
+    }
     }
     OutMI.addOperand(MCOp);
   }
diff --git a/lib/Target/R600/AMDGPUMCInstLower.h b/lib/Target/R600/AMDGPUMCInstLower.h
index 0ae4d11bf1d19..d322fe072b2b5 100644
--- a/lib/Target/R600/AMDGPUMCInstLower.h
+++ b/lib/Target/R600/AMDGPUMCInstLower.h
@@ -19,23 +19,9 @@ class MCContext;
 class MCInst;
 
 class AMDGPUMCInstLower {
-
-  // This must be kept in sync with the SISubtarget class in SIInstrInfo.td
-  enum SISubtarget {
-    SI = 0,
-    VI = 1
-  };
-
   MCContext &Ctx;
   const AMDGPUSubtarget &ST;
 
-  /// Convert a member of the AMDGPUSubtarget::Generation enum to the
-  /// SISubtarget enum.
-  enum SISubtarget AMDGPUSubtargetToSISubtarget(unsigned Gen) const;
-
-  /// Get the MC opcode for this MachineInstr.
-  unsigned getMCOpcode(unsigned MIOpcode) const;
-
 public:
 
   AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &ST);
diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp
index 597e558e66347..b1c7498fc1428 100644
--- a/lib/Target/R600/AMDGPUSubtarget.cpp
+++ b/lib/Target/R600/AMDGPUSubtarget.cpp
@@ -18,7 +18,9 @@
 #include "R600MachineScheduler.h"
 #include "SIISelLowering.h"
 #include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/CodeGen/MachineScheduler.h"
 
 using namespace llvm;
 
@@ -78,6 +80,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef GPU, StringRef FS,
       FlatAddressSpace(false), EnableIRStructurizer(true),
       EnablePromoteAlloca(false), EnableIfCvt(true),
       EnableLoadStoreOpt(false), WavefrontSize(0), CFALUBug(false),
       LocalMemorySize(0),
+      EnableVGPRSpilling(false),
       DL(computeDataLayout(initializeSubtargetDependencies(GPU, FS))),
       FrameLowering(TargetFrameLowering::StackGrowsUp,
                     64 * 16, // Maximum stack alignment (long16)
@@ -113,3 +116,26 @@ unsigned AMDGPUSubtarget::getAmdKernelCodeChipID() const {
   case SEA_ISLANDS: return 12;
   }
 }
+
+bool AMDGPUSubtarget::isVGPRSpillingEnabled(
+    const SIMachineFunctionInfo *MFI) const {
+  return MFI->getShaderType() == ShaderType::COMPUTE || EnableVGPRSpilling;
+}
+
+void AMDGPUSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
+                                          MachineInstr *begin,
+                                          MachineInstr *end,
+                                          unsigned NumRegionInstrs) const {
+  if (getGeneration() >= SOUTHERN_ISLANDS) {
+
+    // Track register pressure so the scheduler can try to decrease
+    // pressure once register usage is above the threshold defined by
+    // SIRegisterInfo::getRegPressureSetLimit()
+    Policy.ShouldTrackPressure = true;
+
+    // Enabling both top down and bottom up scheduling seems to give us less
+    // register spills than just using one of these approaches on its own.
+    Policy.OnlyTopDown = false;
+    Policy.OnlyBottomUp = false;
+  }
+}
diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h
index 90179d79d25d0..566b45c1dccc7 100644
--- a/lib/Target/R600/AMDGPUSubtarget.h
+++ b/lib/Target/R600/AMDGPUSubtarget.h
@@ -30,6 +30,8 @@
 
 namespace llvm {
 
+class SIMachineFunctionInfo;
+
 class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
 
 public:
@@ -63,6 +65,7 @@ private:
   unsigned WavefrontSize;
   bool CFALUBug;
   int LocalMemorySize;
+  bool EnableVGPRSpilling;
 
   const DataLayout DL;
   AMDGPUFrameLowering FrameLowering;
@@ -206,6 +209,10 @@ public:
     return getGeneration() <= NORTHERN_ISLANDS;
   }
 
+  void overrideSchedPolicy(MachineSchedPolicy &Policy,
+                           MachineInstr *begin, MachineInstr *end,
+                           unsigned NumRegionInstrs) const override;
+
   // Helper functions to simplify if statements
   bool isTargetELF() const {
     return false;
@@ -224,6 +231,15 @@ public:
   bool isAmdHsaOS() const {
     return TargetTriple.getOS() == Triple::AMDHSA;
   }
+
+  bool isVGPRSpillingEnabled(const SIMachineFunctionInfo *MFI) const;
+
+  unsigned getMaxWavesPerCU() const {
+    if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)
+      return 10;
+
+    // FIXME: Not sure what this is for other subtargets.
+    llvm_unreachable("do not know max waves per CU for this subtarget.");
+  }
 };
 
 } // End namespace llvm
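Because isVGPRSpillingEnabled only defaults to true for compute shaders, graphics shaders need the new feature bit switched on explicitly. Assuming LLVM's standard subtarget-feature mechanism (the exact llc invocation below is illustrative, not taken from this commit), that would look like:

    llc -march=r600 -mcpu=SI -mattr=+vgpr-spilling input.ll

where the feature name comes from the FeatureVGPRSpilling definition in AMDGPU.td above.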
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
index d0c634fb7e422..5fb311b3016b1 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -29,7 +29,7 @@ public:
                 const MCAsmLayout &Layout) override {
     //XXX: Implement if necessary.
   }
-  void RecordRelocation(MCAssembler &Asm, const MCAsmLayout &Layout,
+  void RecordRelocation(const MCAssembler &Asm, const MCAsmLayout &Layout,
                         const MCFragment *Fragment, const MCFixup &Fixup,
                         MCValue Target, bool &IsPCRel,
                         uint64_t &FixedValue) override {
diff --git a/lib/Target/R600/SIDefines.h b/lib/Target/R600/SIDefines.h
index 73a9c73d8e7b0..7601794beab8a 100644
--- a/lib/Target/R600/SIDefines.h
+++ b/lib/Target/R600/SIDefines.h
@@ -163,4 +163,8 @@ namespace SIOutMods {
 #define R_00B860_COMPUTE_TMPRING_SIZE 0x00B860
 #define S_00B860_WAVESIZE(x) (((x) & 0x1FFF) << 12)
 
+#define R_0286E8_SPI_TMPRING_SIZE 0x0286E8
+#define S_0286E8_WAVESIZE(x) (((x) & 0x1FFF) << 12)
+
+
 #endif
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index 0a3fa2f930d7e..6b2ea0682a43e 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -588,6 +588,12 @@ SDValue SITargetLowering::LowerFormalArguments(
     InVals.push_back(Val);
   }
 
+  if (Info->getShaderType() != ShaderType::COMPUTE) {
+    unsigned ScratchIdx = CCInfo.getFirstUnallocated(
+        AMDGPU::SGPR_32RegClass.begin(), AMDGPU::SGPR_32RegClass.getNumRegs());
+    Info->ScratchOffsetReg = AMDGPU::SGPR_32RegClass.getRegister(ScratchIdx);
+  }
   return Chain;
 }
diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td
index 99a1df36c1f46..09c0cbe8f5c32 100644
--- a/lib/Target/R600/SIInstrFormats.td
+++ b/lib/Target/R600/SIInstrFormats.td
@@ -85,49 +85,41 @@ class Enc64 {
 
 let Uses = [EXEC] in {
 
-class VOPCCommon <dag ins, string asm, list<dag> pattern> :
-    InstSI <(outs VCCReg:$dst), ins, asm, pattern> {
+class VOPAnyCommon <dag outs, dag ins, string asm, list<dag> pattern> :
+    InstSI <outs, ins, asm, pattern> {
 
-  let DisableEncoding = "$dst";
   let mayLoad = 0;
   let mayStore = 0;
   let hasSideEffects = 0;
   let UseNamedOperandTable = 1;
-  let VOPC = 1;
   let VALU = 1;
+}
+
+class VOPCCommon <dag ins, string asm, list<dag> pattern> :
+    VOPAnyCommon <(outs VCCReg:$dst), ins, asm, pattern> {
+
+  let DisableEncoding = "$dst";
+  let VOPC = 1;
   let Size = 4;
 }
 
 class VOP1Common <dag outs, dag ins, string asm, list<dag> pattern> :
-    InstSI <outs, ins, asm, pattern> {
-  let mayLoad = 0;
-  let mayStore = 0;
-  let hasSideEffects = 0;
-  let UseNamedOperandTable = 1;
+    VOPAnyCommon <outs, ins, asm, pattern> {
+
   let VOP1 = 1;
-  let VALU = 1;
   let Size = 4;
 }
 
 class VOP2Common <dag outs, dag ins, string asm, list<dag> pattern> :
-    InstSI <outs, ins, asm, pattern> {
+    VOPAnyCommon <outs, ins, asm, pattern> {
 
-  let mayLoad = 0;
-  let mayStore = 0;
-  let hasSideEffects = 0;
-  let UseNamedOperandTable = 1;
   let VOP2 = 1;
-  let VALU = 1;
   let Size = 4;
 }
 
 class VOP3Common <dag outs, dag ins, string asm, list<dag> pattern> :
-    InstSI <outs, ins, asm, pattern> {
+    VOPAnyCommon <outs, ins, asm, pattern> {
 
-  let mayLoad = 0;
-  let mayStore = 0;
-  let hasSideEffects = 0;
-  let UseNamedOperandTable = 1;
   // Using complex patterns gives VOP3 patterns a very high complexity rating,
   // but standalone patterns are almost always preferred, so we need to adjust the
   // priority lower.  The goal is to use a high number to reduce complexity to
@@ -135,8 +127,6 @@ class VOP3Common <dag outs, dag ins, string asm, list<dag> pattern> :
   let AddedComplexity = -1000;
 
   let VOP3 = 1;
-  let VALU = 1;
-
   int Size = 8;
 }
diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
index 1a4c0d4e57b05..80b560eb65ae9 100644
--- a/lib/Target/R600/SIInstrInfo.cpp
+++ b/lib/Target/R600/SIInstrInfo.cpp
@@ -430,15 +430,6 @@ unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
   return AMDGPU::COPY;
 }
 
-static bool shouldTryToSpillVGPRs(MachineFunction *MF) {
-
-  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
-
-  // FIXME: Implement spilling for other shader types.
-  return MFI->getShaderType() == ShaderType::COMPUTE;
-
-}
-
 void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator MI,
                                       unsigned SrcReg, bool isKill,
@@ -462,7 +453,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
     case 256: Opcode = AMDGPU::SI_SPILL_S256_SAVE; break;
     case 512: Opcode = AMDGPU::SI_SPILL_S512_SAVE; break;
     }
-  } else if(shouldTryToSpillVGPRs(MF) && RI.hasVGPRs(RC)) {
+  } else if(RI.hasVGPRs(RC) && ST.isVGPRSpillingEnabled(MFI)) {
     MFI->setHasSpilledVGPRs();
 
     switch(RC->getSize() * 8) {
@@ -482,7 +473,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
             .addFrameIndex(FrameIndex)
             // Place-holder registers, these will be filled in by
             // SIPrepareScratchRegs.
-            .addReg(AMDGPU::SGPR0_SGPR1, RegState::Undef)
+            .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef)
            .addReg(AMDGPU::SGPR0, RegState::Undef);
  } else {
    LLVMContext &Ctx = MF->getFunction()->getContext();
@@ -499,6 +490,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                        const TargetRegisterClass *RC,
                                        const TargetRegisterInfo *TRI) const {
   MachineFunction *MF = MBB.getParent();
+  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
   MachineFrameInfo *FrameInfo = MF->getFrameInfo();
   DebugLoc DL = MBB.findDebugLoc(MI);
   int Opcode = -1;
@@ -511,7 +503,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
     case 256: Opcode = AMDGPU::SI_SPILL_S256_RESTORE; break;
     case 512: Opcode = AMDGPU::SI_SPILL_S512_RESTORE; break;
     }
-  } else if(shouldTryToSpillVGPRs(MF) && RI.hasVGPRs(RC)) {
+  } else if(RI.hasVGPRs(RC) && ST.isVGPRSpillingEnabled(MFI)) {
     switch(RC->getSize() * 8) {
     case 32:  Opcode = AMDGPU::SI_SPILL_V32_RESTORE; break;
     case 64:  Opcode = AMDGPU::SI_SPILL_V64_RESTORE; break;
@@ -528,7 +520,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
             .addFrameIndex(FrameIndex)
             // Place-holder registers, these will be filled in by
             // SIPrepareScratchRegs.
-            .addReg(AMDGPU::SGPR0_SGPR1, RegState::Undef)
+            .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef)
             .addReg(AMDGPU::SGPR0, RegState::Undef);
 
   } else {
@@ -615,7 +607,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB,
             .addImm(-1)
             .addImm(0);
 
-    BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e32),
+    BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
             TIDReg)
             .addImm(-1)
             .addReg(TIDReg);
@@ -1053,7 +1045,11 @@ bool SIInstrInfo::canFoldOffset(unsigned OffsetSize, unsigned AS) const {
 }
 
 bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
-  return AMDGPU::getVOPe32(Opcode) != -1;
+  int Op32 = AMDGPU::getVOPe32(Opcode);
+  if (Op32 == -1)
+    return false;
+
+  return pseudoToMCOpcode(Op32) != -1;
 }
 
 bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
@@ -1126,12 +1122,18 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
     }
 
     switch (Desc.OpInfo[i].OperandType) {
-    case MCOI::OPERAND_REGISTER: {
-      if (MI->getOperand(i).isImm() &&
-          !isImmOperandLegal(MI, i, MI->getOperand(i))) {
-        ErrInfo = "Illegal immediate value for operand.";
-        return false;
-      }
+    case MCOI::OPERAND_REGISTER:
+      if (MI->getOperand(i).isImm() || MI->getOperand(i).isFPImm()) {
+        ErrInfo = "Illegal immediate value for operand.";
+        return false;
+      }
+      break;
+    case AMDGPU::OPERAND_REG_IMM32:
+      break;
+    case AMDGPU::OPERAND_REG_INLINE_C:
+      if (MI->getOperand(i).isImm() && !isInlineConstant(MI->getOperand(i))) {
+        ErrInfo = "Illegal immediate value for operand.";
+        return false;
       }
       break;
     case MCOI::OPERAND_IMMEDIATE:
@@ -1287,7 +1289,7 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
   case AMDGPU::S_LOAD_DWORDX2_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64;
   case AMDGPU::S_LOAD_DWORDX4_IMM:
   case AMDGPU::S_LOAD_DWORDX4_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64;
-  case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e32;
+  case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
   case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
   case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
   }
@@ -2278,7 +2280,7 @@ void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist
   MachineOperand &Dest = Inst->getOperand(0);
   MachineOperand &Src = Inst->getOperand(1);
 
-  const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e32);
+  const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
   const TargetRegisterClass *SrcRC = Src.isReg() ?
     MRI.getRegClass(Src.getReg()) :
     &AMDGPU::SGPR_32RegClass;
diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h
index f766dc85e86ab..28cd27dd89622 100644
--- a/lib/Target/R600/SIInstrInfo.h
+++ b/lib/Target/R600/SIInstrInfo.h
@@ -325,7 +325,6 @@ namespace AMDGPU {
   int getVOPe32(uint16_t Opcode);
   int getCommuteRev(uint16_t Opcode);
   int getCommuteOrig(uint16_t Opcode);
-  int getMCOpcode(uint16_t Opcode, unsigned Gen);
   int getAddr64Inst(uint16_t Opcode);
   int getAtomicRetOp(uint16_t Opcode);
   int getAtomicNoRetOp(uint16_t Opcode);
diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
index 7cc9588c8e4b8..175e11d709cfd 100644
--- a/lib/Target/R600/SIInstrInfo.td
+++ b/lib/Target/R600/SIInstrInfo.td
@@ -36,6 +36,12 @@ class vop2 <bits<6> si, bits<6> vi = si> : vop {
   field bits<10> VI3 = {0, 1, 0, 0, vi{5-0}};
 }
 
+// Specify a VOP2 opcode for SI and VOP3 opcode for VI
+// that doesn't have VOP2 encoding on VI
+class vop23 <bits<6> si, bits<10> vi> : vop2 <si> {
+  let VI3 = vi;
+}
+
 class vop3 <bits<9> si, bits<10> vi = {0, si}> : vop {
   let SI3 = si;
   let VI3 = vi;
@@ -57,7 +63,7 @@ class sopk <bits<5> si, bits<5> vi = si> {
 }
 
 // Except for the NONE field, this must be kept in sync with the SISubtarget enum
-// in AMDGPUMCInstLower.h
+// in AMDGPUInstrInfo.cpp
 def SISubtarget {
   int NONE = -1;
   int SI = 0;
@@ -731,7 +737,7 @@ class getAsm32 <int NumSrcArgs> {
 // Returns the assembly string for the inputs and outputs of a VOP3
 // instruction.
 class getAsm64 <int NumSrcArgs, bit HasModifiers> {
-  string src0 = "$src0_modifiers,";
+  string src0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,");
   string src1 = !if(!eq(NumSrcArgs, 1), "",
                    !if(!eq(NumSrcArgs, 2), " $src1_modifiers",
                                            " $src1_modifiers,"));
@@ -848,6 +854,16 @@ class VOP2_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> :
   let isPseudo = 1;
 }
 
+multiclass VOP2SI_m <vop2 op, dag outs, dag ins, string asm, list<dag> pattern,
+                     string opName, string revOpSI> {
+  def "" : VOP2_Pseudo <outs, ins, pattern, opName>,
+           VOP2_REV<revOpSI#"_e32", !eq(revOpSI, opName)>;
+
+  def _si : VOP2 <op.SI, outs, ins, opName#asm, []>,
+            VOP2_REV<revOpSI#"_e32_si", !eq(revOpSI, opName)>,
+            SIMCInstr <opName#"_e32", SISubtarget.SI>;
+}
+
 multiclass VOP2_m <vop2 op, dag outs, dag ins, string asm, list<dag> pattern,
                    string opName, string revOpSI, string revOpVI> {
   def "" : VOP2_Pseudo <outs, ins, pattern, opName>,
@@ -889,16 +905,6 @@ class VOP3_Real_vi <bits<10> op, dag outs, dag ins, string asm, string opName> :
   VOP3e_vi <op>,
   SIMCInstr <opName#"_e64", SISubtarget.VI>;
 
-// VI only instruction
-class VOP3_vi <bits<10> op, string opName, dag outs, dag ins, string asm,
-               list<dag> pattern, int NumSrcArgs, bit HasMods = 1> :
-  VOP3Common <outs, ins, asm, pattern>,
-  VOP <opName>,
-  VOP3e_vi <op>,
-  VOP3DisableFields<!if(!eq(NumSrcArgs, 1), 0, 1),
-                    !if(!eq(NumSrcArgs, 2), 0, 1),
-                    HasMods>;
-
 multiclass VOP3_m <vop op, dag outs, dag ins, string asm, list<dag> pattern,
                    string opName, int NumSrcArgs, bit HasMods = 1> {
 
@@ -998,6 +1004,23 @@ multiclass VOP3_C_m <vop op, dag outs, dag ins, string asm,
   }
 }
 
+// An instruction that is VOP2 on SI and VOP3 on VI, no modifiers.
+multiclass VOP2SI_3VI_m <vop3 op, string opName, dag outs, dag ins,
+                         string asm, list<dag> pattern = []> {
+  let isPseudo = 1 in {
+    def "" : VOPAnyCommon <outs, ins, "", pattern>,
+             SIMCInstr<opName, SISubtarget.NONE>;
+  }
+
+  def _si : VOP2 <op.SI3{5-0}, outs, ins, asm, []>,
+            SIMCInstr <opName, SISubtarget.SI>;
+
+  def _vi : VOP3Common <outs, ins, asm, []>,
+            VOP3e_vi <op.VI3>,
+            VOP3DisableFields <1, 0, 0>,
+            SIMCInstr <opName, SISubtarget.VI>;
+}
+
 multiclass VOP1_Helper <vop1 op, string opName, dag outs,
                         dag ins32, string asm32, list<dag> pat32,
                         dag ins64, string asm64, list<dag> pat64,
@@ -1089,6 +1112,33 @@ multiclass VOP2bInst <vop2 op, string opName, VOPProfile P,
                       revOp, P.HasModifiers
 >;
 
+// A VOP2 instruction that is VOP3-only on VI.
+multiclass VOP2_VI3_Helper <vop23 op, string opName, dag outs,
+                            dag ins32, string asm32, list<dag> pat32,
+                            dag ins64, string asm64, list<dag> pat64,
+                            string revOpSI, string revOpVI, bit HasMods> {
+  defm _e32 : VOP2SI_m <op, outs, ins32, asm32, pat32, opName, revOpSI>;
+
+  defm _e64 : VOP3_2_m <op, outs, ins64, opName#"_e64"#asm64, pat64, opName,
+                        revOpSI, revOpVI, HasMods>;
+}
+
+multiclass VOP2_VI3_Inst <vop23 op, string opName, VOPProfile P,
+                          SDPatternOperator node = null_frag,
+                          string revOpSI = opName, string revOpVI = revOpSI>
+  : VOP2_VI3_Helper <
+  op, opName, P.Outs,
+  P.Ins32, P.Asm32, [],
+  P.Ins64, P.Asm64,
+  !if(P.HasModifiers,
+      [(set P.DstVT:$dst,
+            (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
+                                       i1:$clamp, i32:$omod)),
+                  (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))],
+      [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]),
+  revOpSI, revOpVI, P.HasModifiers
+>;
+
 class VOPC_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> :
   VOPCCommon <ins, "", pattern>,
   VOP <opName>,
@@ -1224,34 +1274,6 @@ multiclass VOP3Inst <vop3 op, string opName, VOPProfile P,
   P.NumSrcArgs, P.HasModifiers
 >;
 
-class VOP3InstVI <bits<10> op, string opName, VOPProfile P,
-                  SDPatternOperator node = null_frag> : VOP3_vi <
-  op, opName#"_vi", P.Outs, P.Ins64, opName#P.Asm64,
-  !if(!eq(P.NumSrcArgs, 3),
-      !if(P.HasModifiers,
-          [(set P.DstVT:$dst,
-              (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
-                                         i1:$clamp, i32:$omod)),
-                    (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
-                    (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))))],
-          [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1,
-                                    P.Src2VT:$src2))]),
-      !if(!eq(P.NumSrcArgs, 2),
-          !if(P.HasModifiers,
-              [(set P.DstVT:$dst,
-                  (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
-                                             i1:$clamp, i32:$omod)),
-                        (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))],
-              [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))])
-          /* P.NumSrcArgs == 1 */,
-          !if(P.HasModifiers,
-              [(set P.DstVT:$dst,
-                  (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
-                                             i1:$clamp, i32:$omod))))],
-              [(set P.DstVT:$dst, (node P.Src0VT:$src0))]))),
-  P.NumSrcArgs, P.HasModifiers
->;
-
 multiclass VOP3b_Helper <vop op, RegisterClass vrc, RegisterOperand arc,
                          string opName, list<dag> pattern> :
   VOP3b_2_m <
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index e05b6bb7d0f12..4b1a84662cb57 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -1525,25 +1525,25 @@ defm V_SUBBREV_U32 : VOP2bInst <vop2<0x2a, 0x1e>, "v_subbrev_u32",
 } // End Uses = [VCC]
 } // End isCommutable = 1, Defs = [VCC]
 
-// These instructions only exist on SI and CI
-let SubtargetPredicate = isSICI in {
-
-def V_READLANE_B32 : VOP2 <
-  0x00000001,
+defm V_READLANE_B32 : VOP2SI_3VI_m <
+  vop3 <0x001, 0x289>,
+  "v_readlane_b32",
   (outs SReg_32:$vdst),
   (ins VGPR_32:$src0, SSrc_32:$vsrc1),
-  "v_readlane_b32 $vdst, $src0, $vsrc1",
-  []
+  "v_readlane_b32 $vdst, $src0, $vsrc1"
 >;
 
-def V_WRITELANE_B32 : VOP2 <
-  0x00000002,
+defm V_WRITELANE_B32 : VOP2SI_3VI_m <
+  vop3 <0x002, 0x28a>,
+  "v_writelane_b32",
   (outs VGPR_32:$vdst),
   (ins SReg_32:$src0, SSrc_32:$vsrc1),
-  "v_writelane_b32 $vdst, $src0, $vsrc1",
-  []
+  "v_writelane_b32 $vdst, $src0, $vsrc1"
 >;
 
+// These instructions only exist on SI and CI
+let SubtargetPredicate = isSICI in {
+
 let isCommutable = 1 in {
 defm V_MAC_LEGACY_F32 : VOP2Inst <vop2<0x6>, "v_mac_legacy_f32",
   VOP_F32_F32_F32
@@ -1568,30 +1568,33 @@ defm V_LSHL_B32 : VOP2Inst <vop2<0x19>, "v_lshl_b32", VOP_I32_I32_I32, shl>;
 }
 } // End isCommutable = 1
+} // End let SubtargetPredicate = SICI
 
-defm V_BFM_B32 : VOP2Inst <vop2<0x1e>, "v_bfm_b32", VOP_I32_I32_I32,
-  AMDGPUbfm>;
-defm V_BCNT_U32_B32 : VOP2Inst <vop2<0x22>, "v_bcnt_u32_b32", VOP_I32_I32_I32>;
-defm V_MBCNT_LO_U32_B32 : VOP2Inst <vop2<0x23>, "v_mbcnt_lo_u32_b32",
+defm V_BFM_B32 : VOP2_VI3_Inst <vop23<0x1e, 0x293>, "v_bfm_b32", VOP_I32_I32_I32,
+  AMDGPUbfm
+>;
+defm V_BCNT_U32_B32 : VOP2_VI3_Inst <vop23<0x22, 0x28b>, "v_bcnt_u32_b32",
+  VOP_I32_I32_I32
+>;
+defm V_MBCNT_LO_U32_B32 : VOP2_VI3_Inst <vop23<0x23, 0x28c>, "v_mbcnt_lo_u32_b32",
   VOP_I32_I32_I32
 >;
-defm V_MBCNT_HI_U32_B32 : VOP2Inst <vop2<0x24>, "v_mbcnt_hi_u32_b32",
+defm V_MBCNT_HI_U32_B32 : VOP2_VI3_Inst <vop23<0x24, 0x28d>, "v_mbcnt_hi_u32_b32",
   VOP_I32_I32_I32
 >;
-defm V_LDEXP_F32 : VOP2Inst <vop2<0x2b>, "v_ldexp_f32",
+defm V_LDEXP_F32 : VOP2_VI3_Inst <vop23<0x2b, 0x288>, "v_ldexp_f32",
   VOP_F32_F32_I32, AMDGPUldexp
 >;
 
 ////def V_CVT_PKACCUM_U8_F32 : VOP2_U8 <0x0000002c, "v_cvt_pkaccum_u8_f32", []>;
 ////def V_CVT_PKNORM_I16_F32 : VOP2_I16 <0x0000002d, "v_cvt_pknorm_i16_f32", []>;
 ////def V_CVT_PKNORM_U16_F32 : VOP2_U16 <0x0000002e, "v_cvt_pknorm_u16_f32", []>;
-defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <vop2<0x2f>, "v_cvt_pkrtz_f16_f32",
+defm V_CVT_PKRTZ_F16_F32 : VOP2_VI3_Inst <vop23<0x2f, 0x296>, "v_cvt_pkrtz_f16_f32",
   VOP_I32_F32_F32, int_SI_packf16
 >;
 ////def V_CVT_PK_U16_U32 : VOP2_U16 <0x00000030, "v_cvt_pk_u16_u32", []>;
 ////def V_CVT_PK_I16_I32 : VOP2_I16 <0x00000031, "v_cvt_pk_i16_i32", []>;
 
-} // End let SubtargetPredicate = SICI
-
 //===----------------------------------------------------------------------===//
 // VOP3 Instructions
 //===----------------------------------------------------------------------===//
@@ -1656,9 +1659,6 @@ defm V_ALIGNBYTE_B32 : VOP3Inst <vop3<0x14f, 0x1cf>, "v_alignbyte_b32",
   VOP_I32_I32_I32_I32
 >;
 
-// Only on SI
-defm V_MULLIT_F32 : VOP3Inst <vop3<0x150>, "v_mullit_f32",
-  VOP_F32_F32_F32_F32>;
-
 defm V_MIN3_F32 : VOP3Inst <vop3<0x151>, "v_min3_f32",
   VOP_F32_F32_F32_F32, AMDGPUfmin3>;
 
@@ -1699,20 +1699,6 @@ defm V_DIV_FIXUP_F64 : VOP3Inst <
 
 } // let SchedRW = [WriteDouble]
 
-defm V_LSHL_B64 : VOP3Inst <vop3<0x161>, "v_lshl_b64",
-  VOP_I64_I64_I32, shl
->;
-
-// Only on SI
-defm V_LSHR_B64 : VOP3Inst <vop3<0x162>, "v_lshr_b64",
-  VOP_I64_I64_I32, srl
->;
-
-// Only on SI
-defm V_ASHR_I64 : VOP3Inst <vop3<0x163>, "v_ashr_i64",
-  VOP_I64_I64_I32, sra
->;
-
 let SchedRW = [WriteDouble] in {
 let isCommutable = 1 in {
 
@@ -1785,6 +1771,26 @@ defm V_TRIG_PREOP_F64 : VOP3Inst <
 
 } // let SchedRW = [WriteDouble]
 
+// These instructions only exist on SI and CI
+let SubtargetPredicate = isSICI in {
+
+defm V_LSHL_B64 : VOP3Inst <vop3<0x161>, "v_lshl_b64",
+  VOP_I64_I64_I32, shl
+>;
+
+defm V_LSHR_B64 : VOP3Inst <vop3<0x162>, "v_lshr_b64",
+  VOP_I64_I64_I32, srl
+>;
+
+defm V_ASHR_I64 : VOP3Inst <vop3<0x163>, "v_ashr_i64",
+  VOP_I64_I64_I32, sra
+>;
+
+defm V_MULLIT_F32 : VOP3Inst <vop3<0x150>, "v_mullit_f32",
+  VOP_F32_F32_F32_F32>;
+
+} // End SubtargetPredicate = isSICI
+
 //===----------------------------------------------------------------------===//
 // Pseudo Instructions
 //===----------------------------------------------------------------------===//
@@ -1943,14 +1949,14 @@ multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> {
   let UseNamedOperandTable = 1 in {
     def _SAVE : InstSI <
       (outs),
-      (ins sgpr_class:$src, i32imm:$frame_idx, SReg_64:$scratch_ptr,
+      (ins sgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc,
            SReg_32:$scratch_offset),
       "", []
     >;
 
     def _RESTORE : InstSI <
      (outs sgpr_class:$dst),
-      (ins i32imm:$frame_idx, SReg_64:$scratch_ptr, SReg_32:$scratch_offset),
+      (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset),
       "", []
    >;
  } // End UseNamedOperandTable = 1
@@ -1966,14 +1972,14 @@ multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> {
  let UseNamedOperandTable = 1 in {
    def _SAVE : InstSI <
      (outs),
-      (ins vgpr_class:$src, i32imm:$frame_idx, SReg_64:$scratch_ptr,
+      (ins vgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc,
           SReg_32:$scratch_offset),
      "", []
    >;

    def _RESTORE : InstSI <
      (outs vgpr_class:$dst),
-      (ins i32imm:$frame_idx, SReg_64:$scratch_ptr, SReg_32:$scratch_offset),
+      (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset),
      "", []
    >;
  } // End UseNamedOperandTable = 1
@@ -2728,16 +2734,12 @@ def : Pat <
     (V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0))))
 >;
 
-let Predicates = [isSICI] in {
-
 def : Pat <
   (int_SI_tid),
-  (V_MBCNT_HI_U32_B32_e32 0xffffffff,
+  (V_MBCNT_HI_U32_B32_e64 0xffffffff,
     (V_MBCNT_LO_U32_B32_e64 0xffffffff, 0))
 >;
 
-}
-
 //===----------------------------------------------------------------------===//
 // VOP3 Patterns
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/R600/SIMachineFunctionInfo.h b/lib/Target/R600/SIMachineFunctionInfo.h
index 71852717d7e68..667da4c8af615 100644
--- a/lib/Target/R600/SIMachineFunctionInfo.h
+++ b/lib/Target/R600/SIMachineFunctionInfo.h
@@ -50,6 +50,7 @@ public:
   unsigned NumUserSGPRs;
   std::map<unsigned, unsigned> LaneVGPRs;
   unsigned LDSWaveSpillSize;
+  unsigned ScratchOffsetReg;
   bool hasCalculatedTID() const { return TIDReg != AMDGPU::NoRegister; };
   unsigned getTIDReg() const { return TIDReg; };
   void setTIDReg(unsigned Reg) { TIDReg = Reg; }
diff --git a/lib/Target/R600/SIPrepareScratchRegs.cpp b/lib/Target/R600/SIPrepareScratchRegs.cpp
index f0e7edec6b488..0a57a5bc201ad 100644
--- a/lib/Target/R600/SIPrepareScratchRegs.cpp
+++ b/lib/Target/R600/SIPrepareScratchRegs.cpp
@@ -84,28 +84,10 @@ bool SIPrepareScratchRegs::runOnMachineFunction(MachineFunction &MF) {
   if (!Entry->isLiveIn(ScratchOffsetPreloadReg))
     Entry->addLiveIn(ScratchOffsetPreloadReg);
 
-  // Load the scratch pointer
-  unsigned ScratchPtrReg =
-      TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_64RegClass);
-  int ScratchPtrFI = -1;
-
-  if (ScratchPtrReg != AMDGPU::NoRegister) {
-    // Found an SGPR to use.
-    MRI.setPhysRegUsed(ScratchPtrReg);
-    BuildMI(*Entry, I, DL, TII->get(AMDGPU::S_MOV_B64), ScratchPtrReg)
-            .addReg(ScratchPtrPreloadReg);
-  } else {
-    // No SGPR is available, we must spill.
-    ScratchPtrFI = FrameInfo->CreateSpillStackObject(8, 4);
-    BuildMI(*Entry, I, DL, TII->get(AMDGPU::SI_SPILL_S64_SAVE))
-            .addReg(ScratchPtrPreloadReg)
-            .addFrameIndex(ScratchPtrFI);
-  }
-
   // Load the scratch offset.
   unsigned ScratchOffsetReg =
       TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_32RegClass);
-  int ScratchOffsetFI = ~0;
+  int ScratchOffsetFI = -1;
 
   if (ScratchOffsetReg != AMDGPU::NoRegister) {
     // Found an SGPR to use
@@ -117,7 +99,9 @@ bool SIPrepareScratchRegs::runOnMachineFunction(MachineFunction &MF) {
     ScratchOffsetFI = FrameInfo->CreateSpillStackObject(4,4);
     BuildMI(*Entry, I, DL, TII->get(AMDGPU::SI_SPILL_S32_SAVE))
             .addReg(ScratchOffsetPreloadReg)
-            .addFrameIndex(ScratchOffsetFI);
+            .addFrameIndex(ScratchOffsetFI)
+            .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef)
+            .addReg(AMDGPU::SGPR0, RegState::Undef);
   }
 
 
@@ -125,22 +109,27 @@ bool SIPrepareScratchRegs::runOnMachineFunction(MachineFunction &MF) {
   // add them to all the SI_SPILL_V* instructions.
 
   RegScavenger RS;
-  bool UseRegScavenger =
-      (ScratchPtrReg == AMDGPU::NoRegister ||
-      ScratchOffsetReg == AMDGPU::NoRegister);
+  unsigned ScratchRsrcFI = FrameInfo->CreateSpillStackObject(16, 4);
+  RS.addScavengingFrameIndex(ScratchRsrcFI);
+
   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
        BI != BE; ++BI) {
 
     MachineBasicBlock &MBB = *BI;
-    if (UseRegScavenger)
-      RS.enterBasicBlock(&MBB);
+    // Add the scratch offset reg as a live-in so that the register scavenger
+    // doesn't re-use it.
+    if (!MBB.isLiveIn(ScratchOffsetReg) &&
+        ScratchOffsetReg != AMDGPU::NoRegister)
+      MBB.addLiveIn(ScratchOffsetReg);
+    RS.enterBasicBlock(&MBB);
 
     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
          I != E; ++I) {
       MachineInstr &MI = *I;
+      RS.forward(I);
       DebugLoc DL = MI.getDebugLoc();
       switch(MI.getOpcode()) {
-        default: break;;
+        default: break;
         case AMDGPU::SI_SPILL_V512_SAVE:
         case AMDGPU::SI_SPILL_V256_SAVE:
         case AMDGPU::SI_SPILL_V128_SAVE:
@@ -153,43 +142,66 @@ bool SIPrepareScratchRegs::runOnMachineFunction(MachineFunction &MF) {
         case AMDGPU::SI_SPILL_V256_RESTORE:
         case AMDGPU::SI_SPILL_V512_RESTORE:
 
-          // Scratch Pointer
-          if (ScratchPtrReg == AMDGPU::NoRegister) {
-            ScratchPtrReg = RS.scavengeRegister(&AMDGPU::SGPR_64RegClass, 0);
-            BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_SPILL_S64_RESTORE),
-                    ScratchPtrReg)
-                    .addFrameIndex(ScratchPtrFI)
-                    .addReg(AMDGPU::NoRegister)
-                    .addReg(AMDGPU::NoRegister);
-          } else if (!MBB.isLiveIn(ScratchPtrReg)) {
-            MBB.addLiveIn(ScratchPtrReg);
-          }
+          // Scratch resource
+          unsigned ScratchRsrcReg =
+              RS.scavengeRegister(&AMDGPU::SReg_128RegClass, 0);
+
+          uint64_t Rsrc = AMDGPU::RSRC_DATA_FORMAT | AMDGPU::RSRC_TID_ENABLE |
+                          0xffffffff; // Size
+
+          unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
+          unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
+          unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
+          unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
+
+          BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc0)
+                  .addExternalSymbol("SCRATCH_RSRC_DWORD0")
+                  .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+          BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc1)
+                  .addExternalSymbol("SCRATCH_RSRC_DWORD1")
+                  .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+          BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc2)
+                  .addImm(Rsrc & 0xffffffff)
+                  .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+          BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc3)
+                  .addImm(Rsrc >> 32)
+                  .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
 
           // Scratch Offset
           if (ScratchOffsetReg == AMDGPU::NoRegister) {
             ScratchOffsetReg = RS.scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
             BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_SPILL_S32_RESTORE),
                     ScratchOffsetReg)
                     .addFrameIndex(ScratchOffsetFI)
-                    .addReg(AMDGPU::NoRegister)
-                    .addReg(AMDGPU::NoRegister);
+                    .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef)
+                    .addReg(AMDGPU::SGPR0, RegState::Undef);
           } else if (!MBB.isLiveIn(ScratchOffsetReg)) {
             MBB.addLiveIn(ScratchOffsetReg);
           }
 
-          if (ScratchPtrReg == AMDGPU::NoRegister ||
+          if (ScratchRsrcReg == AMDGPU::NoRegister ||
               ScratchOffsetReg == AMDGPU::NoRegister) {
             LLVMContext &Ctx = MF.getFunction()->getContext();
             Ctx.emitError("ran out of SGPRs for spilling VGPRs");
-            ScratchPtrReg = AMDGPU::SGPR0;
+            ScratchRsrcReg = AMDGPU::SGPR0;
             ScratchOffsetReg = AMDGPU::SGPR0;
           }
-          MI.getOperand(2).setReg(ScratchPtrReg);
+          MI.getOperand(2).setReg(ScratchRsrcReg);
+          MI.getOperand(2).setIsKill(true);
+          MI.getOperand(2).setIsUndef(false);
           MI.getOperand(3).setReg(ScratchOffsetReg);
+          MI.getOperand(3).setIsUndef(false);
+          MI.getOperand(3).setIsKill(false);
+          MI.addOperand(MachineOperand::CreateReg(Rsrc0, false, true, true));
+          MI.addOperand(MachineOperand::CreateReg(Rsrc1, false, true, true));
+          MI.addOperand(MachineOperand::CreateReg(Rsrc2, false, true, true));
+          MI.addOperand(MachineOperand::CreateReg(Rsrc3, false, true, true));
 
           break;
       }
-      if (UseRegScavenger)
-        RS.forward();
     }
   }
   return true;
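The scratch buffer resource built above is a 128-bit descriptor: dwords 0-1 are filled from the SCRATCH_RSRC_DWORD0/1 external symbols (resolved later to the scratch base address), while dwords 2-3 are split out of a single 64-bit constant. A self-contained sketch of that split, with placeholder bit patterns standing in for the real AMDGPU::RSRC_* constants:

    #include <cstdint>
    #include <cstdio>

    int main() {
      // Placeholder values; the real ones are AMDGPU::RSRC_DATA_FORMAT and
      // AMDGPU::RSRC_TID_ENABLE in the backend headers.
      const uint64_t RSRC_DATA_FORMAT = 0xf00000000000ULL;
      const uint64_t RSRC_TID_ENABLE  = 1ULL << 55;

      // Same composition as SIPrepareScratchRegs: format bits, per-thread
      // indexing, and an all-ones size field in the low dword.
      uint64_t Rsrc = RSRC_DATA_FORMAT | RSRC_TID_ENABLE | 0xffffffff;

      uint32_t Dword2 = uint32_t(Rsrc);       // written by S_MOV_B32 into sub2
      uint32_t Dword3 = uint32_t(Rsrc >> 32); // written by S_MOV_B32 into sub3
      printf("dword2 = 0x%08x, dword3 = 0x%08x\n", Dword2, Dword3);
      return 0;
    }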
diff --git a/lib/Target/R600/SIRegisterInfo.cpp b/lib/Target/R600/SIRegisterInfo.cpp
index f9feea470f15e..0396bf384066d 100644
--- a/lib/Target/R600/SIRegisterInfo.cpp
+++ b/lib/Target/R600/SIRegisterInfo.cpp
@@ -23,7 +23,6 @@
 #include "llvm/IR/Function.h"
 #include "llvm/IR/LLVMContext.h"
 
-#include "llvm/Support/Debug.h"
 using namespace llvm;
 
 SIRegisterInfo::SIRegisterInfo(const AMDGPUSubtarget &st)
@@ -51,9 +50,32 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   return Reserved;
 }
 
-unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
-                                             MachineFunction &MF) const {
-  return RC->getNumRegs();
+unsigned SIRegisterInfo::getRegPressureSetLimit(unsigned Idx) const {
+
+  // FIXME: We should adjust the max number of waves based on LDS size.
+  unsigned SGPRLimit = getNumSGPRsAllowed(ST.getMaxWavesPerCU());
+  unsigned VGPRLimit = getNumVGPRsAllowed(ST.getMaxWavesPerCU());
+
+  for (regclass_iterator I = regclass_begin(), E = regclass_end();
+       I != E; ++I) {
+
+    unsigned NumSubRegs = std::max((int)(*I)->getSize() / 4, 1);
+    unsigned Limit;
+
+    if (isSGPRClass(*I)) {
+      Limit = SGPRLimit / NumSubRegs;
+    } else {
+      Limit = VGPRLimit / NumSubRegs;
+    }
+
+    const int *Sets = getRegClassPressureSets(*I);
+    assert(Sets);
+    for (unsigned i = 0; Sets[i] != -1; ++i) {
+      if (Sets[i] == (int)Idx)
+        return Limit;
+    }
+  }
+  return 256;
 }
 
 bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
@@ -98,7 +120,7 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
 void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI,
                                            unsigned LoadStoreOp,
                                            unsigned Value,
-                                           unsigned ScratchPtr,
+                                           unsigned ScratchRsrcReg,
                                            unsigned ScratchOffset,
                                            int64_t Offset,
                                            RegScavenger *RS) const {
@@ -113,33 +135,9 @@ void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI,
   bool RanOutOfSGPRs = false;
   unsigned SOffset = ScratchOffset;
 
-  unsigned RsrcReg = RS->scavengeRegister(&AMDGPU::SReg_128RegClass, MI, 0);
-  if (RsrcReg == AMDGPU::NoRegister) {
-    RanOutOfSGPRs = true;
-    RsrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
-  }
-
   unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
   unsigned Size = NumSubRegs * 4;
 
-  uint64_t Rsrc = AMDGPU::RSRC_DATA_FORMAT | AMDGPU::RSRC_TID_ENABLE |
-                  0xffffffff; // Size
-
-  BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B64),
-          getSubReg(RsrcReg, AMDGPU::sub0_sub1))
-          .addReg(ScratchPtr)
-          .addReg(RsrcReg, RegState::ImplicitDefine);
-
-  BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32),
-          getSubReg(RsrcReg, AMDGPU::sub2))
-          .addImm(Rsrc & 0xffffffff)
-          .addReg(RsrcReg, RegState::ImplicitDefine);
-
-  BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32),
-          getSubReg(RsrcReg, AMDGPU::sub3))
-          .addImm(Rsrc >> 32)
-          .addReg(RsrcReg, RegState::ImplicitDefine);
-
   if (!isUInt<12>(Offset + Size)) {
     SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0);
     if (SOffset == AMDGPU::NoRegister) {
@@ -163,9 +161,9 @@ void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI,
 
     BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
             .addReg(SubReg, getDefRegState(IsLoad))
-            .addReg(RsrcReg, getKillRegState(IsKill))
+            .addReg(ScratchRsrcReg, getKillRegState(IsKill))
             .addImm(Offset)
-            .addReg(SOffset, getKillRegState(IsKill))
+            .addReg(SOffset)
             .addImm(0) // glc
             .addImm(0) // slc
             .addImm(0) // tfe
@@ -235,9 +233,8 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
             Ctx.emitError("Ran out of VGPRs for spilling SGPR");
           }
 
-          if (isM0) {
+          if (isM0)
             SubReg = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0);
-          }
 
           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), SubReg)
                   .addReg(Spill.VGPR)
@@ -262,7 +259,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
     case AMDGPU::SI_SPILL_V32_SAVE:
       buildScratchLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET,
             TII->getNamedOperand(*MI, AMDGPU::OpName::src)->getReg(),
-            TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_ptr)->getReg(),
+            TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(),
            TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(),
             FrameInfo->getObjectOffset(Index), RS);
       MI->eraseFromParent();
@@ -274,7 +271,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
     case AMDGPU::SI_SPILL_V512_RESTORE: {
       buildScratchLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
            TII->getNamedOperand(*MI, AMDGPU::OpName::dst)->getReg(),
-            TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_ptr)->getReg(),
+            TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(),
            TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(),
             FrameInfo->getObjectOffset(Index), RS);
       MI->eraseFromParent();
@@ -289,7 +286,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
         BuildMI(*MBB, MI, MI->getDebugLoc(),
                 TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
                 .addImm(Offset);
-        FIOp.ChangeToRegister(TmpReg, false);
+        FIOp.ChangeToRegister(TmpReg, false, false, true);
       }
     }
   }
@@ -446,6 +443,8 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
   case SIRegisterInfo::TGID_Z:
     return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2);
   case SIRegisterInfo::SCRATCH_WAVE_OFFSET:
+    if (MFI->getShaderType() != ShaderType::COMPUTE)
+      return MFI->ScratchOffsetReg;
     return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 4);
   case SIRegisterInfo::SCRATCH_PTR:
     return AMDGPU::SGPR2_SGPR3;
@@ -475,3 +474,29 @@ unsigned SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
 
   return AMDGPU::NoRegister;
 }
+
+unsigned SIRegisterInfo::getNumVGPRsAllowed(unsigned WaveCount) const {
+  switch(WaveCount) {
+    case 10: return 24;
+    case 9:  return 28;
+    case 8:  return 32;
+    case 7:  return 36;
+    case 6:  return 40;
+    case 5:  return 48;
+    case 4:  return 64;
+    case 3:  return 84;
+    case 2:  return 128;
+    default: return 256;
+  }
+}
+
+unsigned SIRegisterInfo::getNumSGPRsAllowed(unsigned WaveCount) const {
+  switch(WaveCount) {
+    case 10: return 48;
+    case 9:  return 56;
+    case 8:  return 64;
+    case 7:  return 72;
+    case 6:  return 80;
+    case 5:  return 96;
+    default: return 103;
+  }
+}
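The two switch tables just added encode how many registers a kernel may use while still sustaining a given number of in-flight waves. Read in the other direction, they give the occupancy a given register count allows, which is the quantity the new getRegPressureSetLimit is protecting. A short sketch that mirrors the VGPR table from this patch (standalone code, not calling into the backend):

    // Mirrors SIRegisterInfo::getNumVGPRsAllowed from the hunk above.
    static unsigned numVGPRsAllowed(unsigned WaveCount) {
      switch (WaveCount) {
      case 10: return 24;  case 9: return 28;  case 8: return 32;
      case 7:  return 36;  case 6: return 40;  case 5: return 48;
      case 4:  return 64;  case 3: return 84;  case 2: return 128;
      default: return 256;
      }
    }

    // Largest wave count whose VGPR budget still covers NumVGPRs.
    static unsigned maxWavesForVGPRs(unsigned NumVGPRs) {
      for (unsigned Waves = 10; Waves >= 2; --Waves)
        if (numVGPRsAllowed(Waves) >= NumVGPRs)
          return Waves;
      return 1;
    }
    // e.g. maxWavesForVGPRs(40) == 6, maxWavesForVGPRs(65) == 3.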
diff --git a/lib/Target/R600/SIRegisterInfo.h b/lib/Target/R600/SIRegisterInfo.h
index d14212c2b1045..d908ffd12d2ca 100644
--- a/lib/Target/R600/SIRegisterInfo.h
+++ b/lib/Target/R600/SIRegisterInfo.h
@@ -17,6 +17,7 @@
 #define LLVM_LIB_TARGET_R600_SIREGISTERINFO_H
 
 #include "AMDGPURegisterInfo.h"
+#include "llvm/Support/Debug.h"
 
 namespace llvm {
 
@@ -26,8 +27,7 @@ struct SIRegisterInfo : public AMDGPURegisterInfo {
 
   BitVector getReservedRegs(const MachineFunction &MF) const override;
 
-  unsigned getRegPressureLimit(const TargetRegisterClass *RC,
-                               MachineFunction &MF) const override;
+  unsigned getRegPressureSetLimit(unsigned Idx) const override;
 
   bool requiresRegisterScavenging(const MachineFunction &Fn) const override;
 
@@ -105,13 +105,21 @@ struct SIRegisterInfo : public AMDGPURegisterInfo {
   unsigned getPreloadedValue(const MachineFunction &MF,
                              enum PreloadedValue Value) const;
 
+  /// \brief Give the maximum number of VGPRs that can be used by \p WaveCount
+  /// concurrent waves.
+  unsigned getNumVGPRsAllowed(unsigned WaveCount) const;
+
+  /// \brief Give the maximum number of SGPRs that can be used by \p WaveCount
+  /// concurrent waves.
+  unsigned getNumSGPRsAllowed(unsigned WaveCount) const;
+
   unsigned findUnusedRegister(const MachineRegisterInfo &MRI,
                               const TargetRegisterClass *RC) const;
 
 private:
   void buildScratchLoadStore(MachineBasicBlock::iterator MI,
                              unsigned LoadStoreOp, unsigned Value,
-                             unsigned ScratchPtr, unsigned ScratchOffset,
+                             unsigned ScratchRsrcReg, unsigned ScratchOffset,
                              int64_t Offset, RegScavenger *RS) const;
 };
diff --git a/lib/Target/R600/SIShrinkInstructions.cpp b/lib/Target/R600/SIShrinkInstructions.cpp
index f91d1177bbaec..6a3410688fe75 100644
--- a/lib/Target/R600/SIShrinkInstructions.cpp
+++ b/lib/Target/R600/SIShrinkInstructions.cpp
@@ -10,6 +10,7 @@
 //
 
 #include "AMDGPU.h"
+#include "AMDGPUMCInstLower.h"
 #include "AMDGPUSubtarget.h"
 #include "SIInstrInfo.h"
 #include "llvm/ADT/Statistic.h"
@@ -206,13 +207,13 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
         continue;
       }
 
-      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());
-
-      // Op32 could be -1 here if we started with an instruction that had a
+      // getVOPe32 could be -1 here if we started with an instruction that had
       // a 32-bit encoding and then commuted it to an instruction that did not.
-      if (Op32 == -1)
+      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
        continue;
 
+      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());
+
       if (TII->isVOPC(Op32)) {
         unsigned DstReg = MI.getOperand(0).getReg();
         if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
diff --git a/lib/Target/R600/VIInstructions.td b/lib/Target/R600/VIInstructions.td
index 07cfa29ae12b0..24e66cea62779 100644
--- a/lib/Target/R600/VIInstructions.td
+++ b/lib/Target/R600/VIInstructions.td
@@ -11,22 +11,6 @@
 
 let SubtargetPredicate = isVI in {
 
-def V_LDEXP_F32 : VOP3InstVI <0x288, "v_ldexp_f32", VOP_F32_F32_I32,
-  AMDGPUldexp
->;
-def V_BFM_B32 : VOP3InstVI <0x293, "v_bfm_b32", VOP_I32_I32_I32, AMDGPUbfm>;
-def V_BCNT_U32_B32 : VOP3InstVI <0x28b, "v_bcnt_u32_b32", VOP_I32_I32_I32>;
-def V_MBCNT_LO_U32_B32 : VOP3InstVI <0x28c, "v_mbcnt_lo_u32_b32",
-  VOP_I32_I32_I32
->;
-def V_MBCNT_HI_U32_B32 : VOP3InstVI <0x28d, "v_mbcnt_hi_u32_b32",
-  VOP_I32_I32_I32
->;
-
-def V_CVT_PKRTZ_F16_F32 : VOP3InstVI <0x296, "v_cvt_pkrtz_f16_f32",
-  VOP_I32_F32_F32, int_SI_packf16
->;
-
 defm BUFFER_LOAD_DWORD_VI : MUBUF_Load_Helper_vi <
   0x14, "buffer_load_dword", VGPR_32, i32, global_load
 >;
@@ -37,22 +21,13 @@ defm BUFFER_LOAD_FORMAT_XYZW_VI : MUBUF_Load_Helper_vi <
 
 } // End SubtargetPredicate = isVI
 
-//===----------------------------------------------------------------------===//
-// VOP2 Patterns
-//===----------------------------------------------------------------------===//
-
-let Predicates = [isVI] in {
-
-def : Pat <
-  (int_SI_tid),
-  (V_MBCNT_HI_U32_B32 0xffffffff,
-    (V_MBCNT_LO_U32_B32 0xffffffff, 0))
->;
-
 //===----------------------------------------------------------------------===//
 // SMEM Patterns
 //===----------------------------------------------------------------------===//
 
+let Predicates = [isVI] in {
+
 // 1. Offset as 8bit DWORD immediate
 def : Pat <
   (SIload_constant v4i32:$sbase, IMM20bit:$offset),