Diffstat (limited to 'lib/Target/AMDGPU')
 lib/Target/AMDGPU/AMDGPU.td                 |  5
 lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp    | 19
 lib/Target/AMDGPU/AMDGPUSubtarget.cpp       |  2
 lib/Target/AMDGPU/AMDGPUSubtarget.h         |  5
 lib/Target/AMDGPU/AMDILCFGStructurizer.cpp  | 16
 lib/Target/AMDGPU/Processors.td             |  4
 lib/Target/AMDGPU/SIISelLowering.cpp        | 59
 lib/Target/AMDGPU/SIISelLowering.h          |  1
 lib/Target/AMDGPU/SIInstrInfo.td            |  2
 lib/Target/AMDGPU/SIInstructions.td         |  7
 lib/Target/AMDGPU/SIMachineFunctionInfo.cpp |  1
 lib/Target/AMDGPU/SIPrepareScratchRegs.cpp  |  1
 lib/Target/AMDGPU/SIRegisterInfo.cpp        |  5
 lib/Target/AMDGPU/VIInstructions.td         | 42
14 files changed, 134 insertions, 35 deletions
diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td
index ef8ef6268548..68b50504ee44 100644
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@@ -123,6 +123,11 @@ def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug",
         "true",
         "VI SGPR initialization bug requiring a fixed SGPR allocation size">;
 
+def FeatureEnableHugeScratchBuffer : SubtargetFeature<"huge-scratch-buffer",
+        "EnableHugeScratchBuffer",
+        "true",
+        "Enable scratch buffer sizes greater than 128 GB">;
+
 class SubtargetFeatureFetchLimit <string Value> :
                           SubtargetFeature <"fetch"#Value,
         "TexVTXClauseSize",
diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 37b77d778d9f..64c54ccb31ff 100644
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1029,6 +1029,10 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
                                            SDValue &SLC, SDValue &TFE) const {
   SDValue Ptr, Offen, Idxen, Addr64;
 
+  // The addr64 bit was removed for Volcanic Islands.
+  if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+    return false;
+
   SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
               GLC, SLC, TFE);
 
@@ -1095,13 +1099,16 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
 
   // (add n0, c1)
   if (CurDAG->isBaseWithConstantOffset(Addr)) {
+    SDValue N0 = Addr.getOperand(0);
     SDValue N1 = Addr.getOperand(1);
-    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
-
-    if (isLegalMUBUFImmOffset(C1)) {
-      VAddr = Addr.getOperand(0);
-      ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
-      return true;
+    // Offsets in vaddr must be positive.
+    if (CurDAG->SignBitIsZero(N0)) {
+      ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
+      if (isLegalMUBUFImmOffset(C1)) {
+        VAddr = N0;
+        ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
+        return true;
+      }
     }
   }
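A note on the SelectMUBUFScratch change above: the new SignBitIsZero(N0) guard matters because, per the added comment, the register piece of a scratch address is a non-negative byte offset, so folding (add n0, c1) into vaddr = n0 plus an immediate is only sound when n0 is provably non-negative. A minimal standalone C++ illustration of the failure mode (not LLVM code; the concrete values are made up):

    #include <cstdint>
    #include <cstdio>

    int main() {
      int32_t n0 = -16;            // sign bit set: the fold would be unsound
      int32_t c1 = 20;
      int32_t intended = n0 + c1;  // the address the IR computes: 4
      // If n0 were placed in vaddr and read back as an unsigned offset:
      uint64_t hw = (uint64_t)(uint32_t)n0 + (uint32_t)c1; // 0x100000004
      printf("intended=%d, unsigned reinterpretation=%#llx\n",
             intended, (unsigned long long)hw);
      return 0;
    }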
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index bd5abc4f546e..5f32a65c9338 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -73,7 +73,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
       WavefrontSize(0), CFALUBug(false), LocalMemorySize(0),
       EnableVGPRSpilling(false), SGPRInitBug(false), IsGCN(false),
       GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), LDSBankCount(0),
-      IsaVersion(ISAVersion0_0_0),
+      IsaVersion(ISAVersion0_0_0), EnableHugeScratchBuffer(false),
       FrameLowering(TargetFrameLowering::StackGrowsUp,
                     64 * 16, // Maximum stack alignment (long16)
                     0),
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 90831bfb4458..735f01dfa7c5 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -89,6 +89,7 @@ private:
   bool FeatureDisable;
   int LDSBankCount;
   unsigned IsaVersion;
+  bool EnableHugeScratchBuffer;
 
   AMDGPUFrameLowering FrameLowering;
   std::unique_ptr<AMDGPUTargetLowering> TLInfo;
@@ -271,6 +272,10 @@ public:
     return DevName;
   }
 
+  bool enableHugeScratchBuffer() const {
+    return EnableHugeScratchBuffer;
+  }
+
   bool dumpCode() const {
     return DumpCode;
   }
diff --git a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
index c9b25a1a0b84..d918ac3a5b3b 100644
--- a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
+++ b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
@@ -1719,7 +1719,6 @@ MachineBasicBlock *
 AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) {
   MachineBasicBlock *LoopHeader = LoopRep->getHeader();
   MachineBasicBlock *LoopLatch = LoopRep->getLoopLatch();
-  const TargetRegisterClass *I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
 
   if (!LoopHeader || !LoopLatch)
     return nullptr;
@@ -1732,18 +1731,9 @@ AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) {
   FuncRep->push_back(DummyExitBlk);  // insert to function
   SHOWNEWBLK(DummyExitBlk, "DummyExitBlock to normalize infiniteLoop: ");
   DEBUG(dbgs() << "Old branch instr: " << *BranchMI << "\n";);
-  MachineBasicBlock::iterator I = BranchMI;
-  unsigned ImmReg = FuncRep->getRegInfo().createVirtualRegister(I32RC);
-  llvm_unreachable("Extra register needed to handle CFG");
-  MachineInstr *NewMI = insertInstrBefore(I, AMDGPU::BRANCH_COND_i32);
-  MachineInstrBuilder MIB(*FuncRep, NewMI);
-  MIB.addMBB(LoopHeader);
-  MIB.addReg(ImmReg, false);
-  SHOWNEWINSTR(NewMI);
-  BranchMI->eraseFromParent();
-  LoopLatch->addSuccessor(DummyExitBlk);
-
-  return DummyExitBlk;
+  LLVMContext &Ctx = LoopHeader->getParent()->getFunction()->getContext();
+  Ctx.emitError("Extra register needed to handle CFG");
+  return nullptr;
 }
 
 void AMDGPUCFGStructurizer::removeUnconditionalBranch(MachineBasicBlock *MBB) {
diff --git a/lib/Target/AMDGPU/Processors.td b/lib/Target/AMDGPU/Processors.td
index 69efb8b8bc43..d9a0723bedc9 100644
--- a/lib/Target/AMDGPU/Processors.td
+++ b/lib/Target/AMDGPU/Processors.td
@@ -138,3 +138,7 @@ def : ProcessorModel<"iceland", SIQuarterSpeedModel,
 def : ProcessorModel<"carrizo", SIQuarterSpeedModel,
   [FeatureVolcanicIslands, FeatureISAVersion8_0_1]
 >;
+
+def : ProcessorModel<"fiji", SIQuarterSpeedModel,
+  [FeatureVolcanicIslands, FeatureISAVersion8_0_1]
+>;
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index dd818a9ba746..099b0b15942b 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -254,6 +254,12 @@ bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &,
   return false;
 }
 
+bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
+  // Flat instructions do not have offsets, and only have the register
+  // address.
+  return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1);
+}
+
 bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                              const AddrMode &AM, Type *Ty,
                                              unsigned AS) const {
@@ -263,8 +269,21 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
 
   switch (AS) {
   case AMDGPUAS::GLOBAL_ADDRESS:
-  case AMDGPUAS::CONSTANT_ADDRESS: // XXX - Should we assume SMRD instructions?
+    if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+      // Assume that we will use FLAT for all global memory accesses
+      // on VI.
+      // FIXME: This assumption is currently wrong. On VI we still use
+      // MUBUF instructions for the r + i addressing mode. As currently
+      // implemented, the MUBUF instructions only work on buffers < 4GB.
+      // It may be possible to support > 4GB buffers with MUBUF instructions,
+      // by setting the stride value in the resource descriptor, which would
+      // increase the size limit to (stride * 4GB). However, this is risky,
+      // because it has never been validated.
+      return isLegalFlatAddressingMode(AM);
+    }
+    // fall-through
   case AMDGPUAS::PRIVATE_ADDRESS:
+  case AMDGPUAS::CONSTANT_ADDRESS: // XXX - Should we assume SMRD instructions?
   case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: {
     // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
     // additionally can do r + r + i with addr64. 32-bit has more addressing
@@ -324,11 +343,9 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
     return false;
   }
 
-  case AMDGPUAS::FLAT_ADDRESS: {
-    // Flat instructions do not have offsets, and only have the register
-    // address.
-    return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1);
-  }
+  case AMDGPUAS::FLAT_ADDRESS:
+    return isLegalFlatAddressingMode(AM);
+
   default:
     llvm_unreachable("unhandled address space");
   }
@@ -812,10 +829,29 @@ static SDNode *findUser(SDValue Value, unsigned Opcode) {
 
 SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
 
+  SDLoc SL(Op);
   FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op);
   unsigned FrameIndex = FINode->getIndex();
 
-  return DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
+  // A FrameIndex node represents a 32-bit offset into scratch memory. If
+  // the high bit of a frame index offset were to be set, this would mean
+  // that it represented an offset of ~2GB * 64 = ~128GB from the start of the
+  // scratch buffer, with 64 being the number of threads per wave.
+  //
+  // If we know the machine uses less than 128GB of scratch, then we can
+  // mark the high bit of the FrameIndex node as known zero,
+  // which is important, because it means in most situations we can
+  // prove that values derived from FrameIndex nodes are non-negative.
+  // This enables us to take advantage of more addressing modes when
+  // accessing scratch buffers, since for scratch reads/writes, the register
+  // offset must always be positive.
+
+  SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
+  if (Subtarget->enableHugeScratchBuffer())
+    return TFI;
+
+  return DAG.getNode(ISD::AssertZext, SL, MVT::i32, TFI,
+                     DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), 31)));
 }
 
 /// This transforms the control flow intrinsics to get the branch destination as
@@ -2034,6 +2070,13 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
   }
 }
 
+static bool isFrameIndexOp(SDValue Op) {
+  if (Op.getOpcode() == ISD::AssertZext)
+    Op = Op.getOperand(0);
+
+  return isa<FrameIndexSDNode>(Op);
+}
+
 /// \brief Legalize target independent instructions (e.g. INSERT_SUBREG)
 /// with frame index operands.
 /// LLVM assumes that inputs to these instructions are registers.
@@ -2042,7 +2085,7 @@ void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
 
   SmallVector<SDValue, 8> Ops;
   for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
-    if (!isa<FrameIndexSDNode>(Node->getOperand(i))) {
+    if (!isFrameIndexOp(Node->getOperand(i))) {
       Ops.push_back(Node->getOperand(i));
       continue;
     }
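A note on LowerFrameIndex above: wrapping the TargetFrameIndex in an AssertZext with a 31-bit integer VT is how the DAG records that bit 31 of the offset is known zero, which in turn lets the SignBitIsZero check in SelectMUBUFScratch succeed. The 128GB figure in the comment is simply a 31-bit per-lane offset (2 GiB) scaled by 64 lanes per wavefront; a standalone sanity check of the arithmetic:

    #include <cstdint>
    #include <cstdio>

    int main() {
      const uint64_t perLaneLimit = 1ull << 31; // 2 GiB: bit 31 clear
      const uint64_t lanesPerWave = 64;
      const uint64_t totalScratch = perLaneLimit * lanesPerWave;
      printf("scratch ceiling: %llu GiB\n",
             (unsigned long long)(totalScratch >> 30)); // prints 128
      return 0;
    }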
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index 635b4edc89de..d84c32ec0092 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -56,6 +56,7 @@ class SITargetLowering : public AMDGPUTargetLowering {
   SDValue performMin3Max3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
+  bool isLegalFlatAddressingMode(const AddrMode &AM) const;
 public:
   SITargetLowering(TargetMachine &tm, const AMDGPUSubtarget &STI);
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td
index b39a78714640..8d8110bca4c5 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1600,12 +1600,14 @@ multiclass VOPC_m <vopc op, dag outs, dag ins, string asm, list<dag> pattern,
                 SIMCInstr <opName#"_e32", SISubtarget.SI> {
     let Defs = !if(DefExec, [EXEC], []);
     let hasSideEffects = DefExec;
+    let AssemblerPredicates = [isSICI];
   }
 
   def _vi : VOPC<op.VI, ins, asm, []>,
             SIMCInstr <opName#"_e32", SISubtarget.VI> {
     let Defs = !if(DefExec, [EXEC], []);
     let hasSideEffects = DefExec;
+    let AssemblerPredicates = [isVI];
   }
 }
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
index 1ee63c675822..f78ffd72314c 100644
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -2910,9 +2910,6 @@ defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_ADDR64, i32, sextloadi8_constant>;
 defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_ADDR64, i32, az_extloadi8_constant>;
 defm : MUBUFLoad_Pattern <BUFFER_LOAD_SSHORT_ADDR64, i32, sextloadi16_constant>;
 defm : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_ADDR64, i32, az_extloadi16_constant>;
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORD_ADDR64, i32, constant_load>;
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, v2i32, constant_load>;
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX4_ADDR64, v4i32, constant_load>;
 } // End Predicates = [isSICI]
 
 class MUBUFScratchLoadPat <MUBUF Instr, ValueType vt, PatFrag ld> : Pat <
@@ -3273,13 +3270,13 @@ def : Pat <
   (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)),
              (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))),
   (V_CNDMASK_B64_PSEUDO
-      $x,
       (V_MIN_F64
           SRCMODS.NONE,
           (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE),
           SRCMODS.NONE,
           (V_MOV_B64_PSEUDO 0x3fefffffffffffff),
           DSTCLAMP.NONE, DSTOMOD.NONE),
+      $x,
       (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, 3/*NaN*/))
 >;
@@ -3291,13 +3288,13 @@ def : Pat <
       $x,
       SRCMODS.NEG,
       (V_CNDMASK_B64_PSEUDO
-          $x,
           (V_MIN_F64
               SRCMODS.NONE,
               (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE),
               SRCMODS.NONE,
               (V_MOV_B64_PSEUDO 0x3fefffffffffffff),
               DSTCLAMP.NONE, DSTOMOD.NONE),
+          $x,
           (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, 3/*NaN*/)),
       DSTCLAMP.NONE, DSTOMOD.NONE)
 >;
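A note on the two f64 fract pattern fixes above: the reorder changes which source V_CNDMASK_B64_PSEUDO selects when V_CMP_CLASS_F64 reports true (class mask 3, i.e. NaN). Assuming cndmask picks its second source when the condition is set, the corrected pattern computes, in scalar terms, roughly the sketch below; the helper name is illustrative, and 0x3fefffffffffffff is the bit pattern of the largest double strictly below 1.0:

    #include <algorithm>
    #include <cmath>

    // Scalar model of the selected code: NaN inputs pass through
    // unchanged, otherwise the hardware fract result is clamped to
    // the largest double below 1.0.
    double fract_f64_sketch(double x) {
      if (std::isnan(x))                 // V_CMP_CLASS_F64, mask 3
        return x;                        // cndmask now yields $x for NaN
      double f = x - std::floor(x);      // V_FRACT_F64_e64
      return std::min(f, 0x1.fffffffffffffp-1); // V_MIN_F64 clamp
    }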
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index d23b92edef33..587ea63d6796 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -53,6 +53,7 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
   if (!LaneVGPRs.count(LaneVGPRIdx)) {
     unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass);
     LaneVGPRs[LaneVGPRIdx] = LaneVGPR;
+    MRI.setPhysRegUsed(LaneVGPR);
 
     // Add this register as live-in to all blocks to avoid machine verifier
     // complaining about use of an undefined physical register.
diff --git a/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp b/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp
index b086d2ed6652..0a7f684552f0 100644
--- a/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp
+++ b/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp
@@ -91,6 +91,7 @@ bool SIPrepareScratchRegs::runOnMachineFunction(MachineFunction &MF) {
 
   if (ScratchOffsetReg != AMDGPU::NoRegister) {
     // Found an SGPR to use
+    MRI.setPhysRegUsed(ScratchOffsetReg);
     BuildMI(*Entry, I, DL, TII->get(AMDGPU::S_MOV_B32), ScratchOffsetReg)
             .addReg(ScratchOffsetPreloadReg);
   } else {
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp
index ce4acafac9fa..54c4d549fac7 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -348,7 +348,8 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
     &AMDGPU::SReg_128RegClass,
     &AMDGPU::VReg_256RegClass,
     &AMDGPU::SReg_256RegClass,
-    &AMDGPU::VReg_512RegClass
+    &AMDGPU::VReg_512RegClass,
+    &AMDGPU::SReg_512RegClass
   };
 
   for (const TargetRegisterClass *BaseClass : BaseClasses) {
@@ -499,7 +500,7 @@ unsigned SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
 
   for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end();
        I != E; ++I) {
-    if (MRI.reg_nodbg_empty(*I))
+    if (!MRI.isPhysRegUsed(*I))
       return *I;
   }
   return AMDGPU::NoRegister;
diff --git a/lib/Target/AMDGPU/VIInstructions.td b/lib/Target/AMDGPU/VIInstructions.td
index 5bf86e649ce0..aca46732adb9 100644
--- a/lib/Target/AMDGPU/VIInstructions.td
+++ b/lib/Target/AMDGPU/VIInstructions.td
@@ -103,4 +103,46 @@ def : Pat <
   (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset))
 >;
 
+// Patterns for global loads with no offset
+class FlatLoadPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
+  (vt (node i64:$addr)),
+  (inst $addr, 0, 0, 0)
+>;
+
+def : FlatLoadPat <FLAT_LOAD_UBYTE, az_extloadi8_global, i32>;
+def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_global, i32>;
+def : FlatLoadPat <FLAT_LOAD_USHORT, az_extloadi16_global, i32>;
+def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_global, i32>;
+def : FlatLoadPat <FLAT_LOAD_DWORD, global_load, i32>;
+def : FlatLoadPat <FLAT_LOAD_DWORDX2, global_load, v2i32>;
+def : FlatLoadPat <FLAT_LOAD_DWORDX4, global_load, v4i32>;
+
+class FlatStorePat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
+  (node vt:$data, i64:$addr),
+  (inst $data, $addr, 0, 0, 0)
+>;
+
+def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_global, i32>;
+def : FlatStorePat <FLAT_STORE_SHORT, truncstorei16_global, i32>;
+def : FlatStorePat <FLAT_STORE_DWORD, global_store, i32>;
+def : FlatStorePat <FLAT_STORE_DWORDX2, global_store, v2i32>;
+def : FlatStorePat <FLAT_STORE_DWORDX4, global_store, v4i32>;
+
+class FlatAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
+  (vt (node i64:$addr, vt:$data)),
+  (inst $addr, $data, 0, 0)
+>;
+
+def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_add_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_and_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_sub_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_SMAX_RTN, atomic_max_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_UMAX_RTN, atomic_umax_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN, atomic_min_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_UMIN_RTN, atomic_umin_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_or_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_xor_global, i32>;
+
 } // End Predicates = [isVI]
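For reference, each Flat*Pat instantiation above is shorthand for an anonymous selection pattern. The FLAT_LOAD_DWORD line, for example, expands to roughly the following; the three trailing zeros presumably clear the glc/slc/tfe modifier operands:

    def : Pat <
      (i32 (global_load i64:$addr)),
      (FLAT_LOAD_DWORD $addr, 0, 0, 0)
    >;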