Diffstat (limited to 'lib/Target/AMDGPU')
24 files changed, 408 insertions, 150 deletions
diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td
index 569ad3844b25..ef8ef6268548 100644
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@@ -98,6 +98,16 @@ def FeatureEnableLoadStoreOpt : SubtargetFeature <"load-store-opt",
         "true",
         "Enable SI load/store optimizer pass">;
 
+// Performance debugging feature. Allow using DS instruction immediate
+// offsets even if the base pointer can't be proven to be base. On SI,
+// base pointer values that won't give the same result as a 16-bit add
+// are not safe to fold, but this will override the conservative test
+// for the base pointer.
+def FeatureEnableUnsafeDSOffsetFolding : SubtargetFeature <"unsafe-ds-offset-folding",
+        "EnableUnsafeDSOffsetFolding",
+        "true",
+        "Force using DS instruction immediate offsets on SI">;
+
 def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space",
         "FlatAddressSpace",
         "true",
diff --git a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
index 0b426bc63dd5..ad267d350850 100644
--- a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
@@ -22,7 +22,6 @@ using namespace llvm;
 namespace {
 
 class AMDGPUAlwaysInline : public ModulePass {
-
   static char ID;
 
 public:
@@ -36,10 +35,9 @@ public:
 char AMDGPUAlwaysInline::ID = 0;
 
 bool AMDGPUAlwaysInline::runOnModule(Module &M) {
+  std::vector<Function *> FuncsToClone;
 
-  std::vector<Function*> FuncsToClone;
-  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
-    Function &F = *I;
+  for (Function &F : M) {
     if (!F.hasLocalLinkage() && !F.isDeclaration() && !F.use_empty() &&
         !F.hasFnAttribute(Attribute::NoInline))
       FuncsToClone.push_back(&F);
@@ -49,12 +47,11 @@ bool AMDGPUAlwaysInline::runOnModule(Module &M) {
     ValueToValueMapTy VMap;
     Function *NewFunc = CloneFunction(F, VMap, false);
     NewFunc->setLinkage(GlobalValue::InternalLinkage);
-    F->getParent()->getFunctionList().push_back(NewFunc);
+    M.getFunctionList().push_back(NewFunc);
     F->replaceAllUsesWith(NewFunc);
   }
 
-  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
-    Function &F = *I;
+  for (Function &F : M) {
     if (F.hasLocalLinkage() && !F.hasFnAttribute(Attribute::NoInline)) {
       F.addFnAttr(Attribute::AlwaysInline);
     }
diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index df4461eac4db..37b77d778d9f 100644
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -110,8 +110,11 @@ private:
                        SDValue &Offset, SDValue &GLC) const;
   SDNode *SelectAddrSpaceCast(SDNode *N);
   bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+  bool SelectVOP3NoMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
   bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods,
                        SDValue &Clamp, SDValue &Omod) const;
+  bool SelectVOP3NoMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
+                         SDValue &Clamp, SDValue &Omod) const;
   bool SelectVOP3Mods0Clamp(SDValue In, SDValue &Src, SDValue &SrcMods,
                             SDValue &Omod) const;
@@ -859,7 +862,8 @@ bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset,
       (OffsetBits == 8 && !isUInt<8>(Offset)))
     return false;
 
-  if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS)
+  if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS ||
+      Subtarget->unsafeDSOffsetFoldingEnabled())
     return true;
 
   // On Southern Islands instruction with a negative base value and an offset
@@ -1316,6 +1320,12 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
   return true;
 }
 
+bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src,
+                                          SDValue &SrcMods) const {
+  bool Res = SelectVOP3Mods(In, Src, SrcMods);
+  return Res && cast<ConstantSDNode>(SrcMods)->isNullValue();
+}
+
 bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
                                          SDValue &SrcMods, SDValue &Clamp,
                                          SDValue &Omod) const {
@@ -1327,6 +1337,16 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
   return SelectVOP3Mods(In, Src, SrcMods);
 }
 
+bool AMDGPUDAGToDAGISel::SelectVOP3NoMods0(SDValue In, SDValue &Src,
+                                           SDValue &SrcMods, SDValue &Clamp,
+                                           SDValue &Omod) const {
+  bool Res = SelectVOP3Mods0(In, Src, SrcMods, Clamp, Omod);
+
+  return Res && cast<ConstantSDNode>(SrcMods)->isNullValue() &&
+         cast<ConstantSDNode>(Clamp)->isNullValue() &&
+         cast<ConstantSDNode>(Omod)->isNullValue();
+}
+
 bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp(SDValue In, SDValue &Src,
                                               SDValue &SrcMods,
                                               SDValue &Omod) const {
@@ -1351,18 +1371,14 @@ void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
   do {
     IsModified = false;
     // Go over all selected nodes and try to fold them a bit more
-    for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
-         E = CurDAG->allnodes_end(); I != E; ++I) {
-
-      SDNode *Node = I;
-
-      MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(I);
+    for (SDNode &Node : CurDAG->allnodes()) {
+      MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(&Node);
       if (!MachineNode)
         continue;
 
       SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
-      if (ResNode != Node) {
-        ReplaceUses(Node, ResNode);
+      if (ResNode != &Node) {
+        ReplaceUses(&Node, ResNode);
         IsModified = true;
       }
     }
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index d56838ec2019..3a65f3b56146 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -406,6 +406,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
   setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
   setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
 
+  setTargetDAGCombine(ISD::SHL);
   setTargetDAGCombine(ISD::MUL);
   setTargetDAGCombine(ISD::SELECT);
   setTargetDAGCombine(ISD::SELECT_CC);
@@ -444,7 +445,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
 // Target Information
 //===----------------------------------------------------------------------===//
 
-MVT AMDGPUTargetLowering::getVectorIdxTy() const {
+MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
   return MVT::i32;
 }
@@ -545,9 +546,8 @@ bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
 }
 
 bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
-  const DataLayout *DL = getDataLayout();
-  unsigned SrcSize = DL->getTypeSizeInBits(Src->getScalarType());
-  unsigned DestSize = DL->getTypeSizeInBits(Dest->getScalarType());
+  unsigned SrcSize = Src->getScalarSizeInBits();
+  unsigned DestSize = Dest->getScalarSizeInBits();
 
   return SrcSize == 32 && DestSize == 64;
 }
@@ -697,7 +697,7 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
                                                        const SDValue &InitPtr,
                                                        SDValue Chain,
                                                        SelectionDAG &DAG) const {
-  const DataLayout *TD = getDataLayout();
+  const DataLayout &TD = DAG.getDataLayout();
   SDLoc DL(InitPtr);
   Type *InitTy = Init->getType();
@@ -705,20 +705,20 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
     EVT VT = EVT::getEVT(InitTy);
     PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS);
     return DAG.getStore(Chain, DL, DAG.getConstant(*CI, DL, VT), InitPtr,
-                        MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
-                        TD->getPrefTypeAlignment(InitTy));
+                        MachinePointerInfo(UndefValue::get(PtrTy)), false,
+                        false, TD.getPrefTypeAlignment(InitTy));
   }
 
   if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Init)) {
     EVT VT = EVT::getEVT(CFP->getType());
     PointerType *PtrTy = PointerType::get(CFP->getType(), 0);
     return DAG.getStore(Chain, DL, DAG.getConstantFP(*CFP, DL, VT), InitPtr,
-                        MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
-                        TD->getPrefTypeAlignment(CFP->getType()));
+                        MachinePointerInfo(UndefValue::get(PtrTy)), false,
+                        false, TD.getPrefTypeAlignment(CFP->getType()));
   }
 
   if (StructType *ST = dyn_cast<StructType>(InitTy)) {
-    const StructLayout *SL = TD->getStructLayout(ST);
+    const StructLayout *SL = TD.getStructLayout(ST);
 
     EVT PtrVT = InitPtr.getValueType();
     SmallVector<SDValue, 8> Chains;
@@ -745,7 +745,7 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
     else
       llvm_unreachable("Unexpected type");
 
-    unsigned EltSize = TD->getTypeAllocSize(SeqTy->getElementType());
+    unsigned EltSize = TD.getTypeAllocSize(SeqTy->getElementType());
     SmallVector<SDValue, 8> Chains;
     for (unsigned i = 0; i < NumElements; ++i) {
       SDValue Offset = DAG.getConstant(i * EltSize, DL, PtrVT);
@@ -762,8 +762,8 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
     EVT VT = EVT::getEVT(InitTy);
     PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS);
     return DAG.getStore(Chain, DL, DAG.getUNDEF(VT), InitPtr,
-                        MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
-                        TD->getPrefTypeAlignment(InitTy));
+                        MachinePointerInfo(UndefValue::get(PtrTy)), false,
+                        false, TD.getPrefTypeAlignment(InitTy));
   }
 
   Init->dump();
@@ -785,7 +785,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
                                                  SDValue Op,
                                                  SelectionDAG &DAG) const {
 
-  const DataLayout *TD = getDataLayout();
+  const DataLayout &DL = DAG.getDataLayout();
   GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
   const GlobalValue *GV = G->getGlobal();
@@ -801,7 +801,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
 
     unsigned Offset;
     if (MFI->LocalMemoryObjects.count(GV) == 0) {
-      uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType());
+      uint64_t Size = DL.getTypeAllocSize(GV->getType()->getElementType());
       Offset = MFI->LDSSize;
       MFI->LocalMemoryObjects[GV] = Offset;
       // XXX: Account for alignment?
@@ -811,16 +811,16 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
     }
 
     return DAG.getConstant(Offset, SDLoc(Op),
-                           getPointerTy(AMDGPUAS::LOCAL_ADDRESS));
+                           getPointerTy(DL, AMDGPUAS::LOCAL_ADDRESS));
   }
   case AMDGPUAS::CONSTANT_ADDRESS: {
     MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
     Type *EltType = GV->getType()->getElementType();
-    unsigned Size = TD->getTypeAllocSize(EltType);
-    unsigned Alignment = TD->getPrefTypeAlignment(EltType);
+    unsigned Size = DL.getTypeAllocSize(EltType);
+    unsigned Alignment = DL.getPrefTypeAlignment(EltType);
 
-    MVT PrivPtrVT = getPointerTy(AMDGPUAS::PRIVATE_ADDRESS);
-    MVT ConstPtrVT = getPointerTy(AMDGPUAS::CONSTANT_ADDRESS);
+    MVT PrivPtrVT = getPointerTy(DL, AMDGPUAS::PRIVATE_ADDRESS);
+    MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
 
     int FI = FrameInfo->CreateStackObject(Size, Alignment, false);
     SDValue InitPtr = DAG.getFrameIndex(FI, PrivPtrVT);
@@ -1653,7 +1653,7 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool
   // fb = fabs(fb);
   fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
 
-  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), VT);
+  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
 
   // int cv = fr >= fb;
   SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
@@ -1960,7 +1960,8 @@ SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
   const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
 
-  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64);
+  EVT SetCCVT =
+      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
 
   SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
   SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
@@ -2020,7 +2021,8 @@ SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
   SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
   SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
 
-  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32);
+  EVT SetCCVT =
+      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
 
   const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
@@ -2051,7 +2053,8 @@ SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
   APFloat C2Val(APFloat::IEEEdouble, "0x1.fffffffffffffp+51");
   SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
 
-  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64);
+  EVT SetCCVT =
+      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
   SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
 
   return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
@@ -2081,7 +2084,8 @@ SDValue AMDGPUTargetLowering::LowerFROUND32(SDValue Op, SelectionDAG &DAG) const
 
   SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f32, One, X);
 
-  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32);
+  EVT SetCCVT =
+      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
 
   SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
@@ -2100,8 +2104,8 @@ SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const
   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
   const SDValue NegOne = DAG.getConstant(-1, SL, MVT::i32);
   const SDValue FiftyOne = DAG.getConstant(51, SL, MVT::i32);
-  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32);
-
+  EVT SetCCVT =
+      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
 
   SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
@@ -2172,7 +2176,8 @@ SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
   const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
   const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
 
-  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64);
+  EVT SetCCVT =
+      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
 
   SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
   SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
@@ -2411,6 +2416,33 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
                       SN->getBasePtr(), SN->getMemOperand());
 }
 
+SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
+                                                DAGCombinerInfo &DCI) const {
+  if (N->getValueType(0) != MVT::i64)
+    return SDValue();
+
+  // i64 (shl x, 32) -> (build_pair 0, x)
+
+  // Doing this with moves theoretically helps MI optimizations that understand
+  // copies. 2 v_mov_b32_e32 will have the same code size / cycle count as
+  // v_lshl_b64. In the SALU case, I think this is slightly worse since it
+  // doubles the code size and I'm unsure about cycle count.
+  const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  if (!RHS || RHS->getZExtValue() != 32)
+    return SDValue();
+
+  SDValue LHS = N->getOperand(0);
+
+  SDLoc SL(N);
+  SelectionDAG &DAG = DCI.DAG;
+
+  // Extract low 32-bits.
+  SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
+
+  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+  return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Zero, Lo);
+}
+
 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
   EVT VT = N->getValueType(0);
@@ -2448,17 +2480,24 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
   SDLoc DL(N);
 
   switch(N->getOpcode()) {
-    default: break;
-    case ISD::MUL:
-      return performMulCombine(N, DCI);
-    case AMDGPUISD::MUL_I24:
-    case AMDGPUISD::MUL_U24: {
-      SDValue N0 = N->getOperand(0);
-      SDValue N1 = N->getOperand(1);
-      simplifyI24(N0, DCI);
-      simplifyI24(N1, DCI);
-      return SDValue();
-    }
+  default:
+    break;
+  case ISD::SHL: {
+    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
+      break;
+
+    return performShlCombine(N, DCI);
+  }
+  case ISD::MUL:
+    return performMulCombine(N, DCI);
+  case AMDGPUISD::MUL_I24:
+  case AMDGPUISD::MUL_U24: {
+    SDValue N0 = N->getOperand(0);
+    SDValue N1 = N->getOperand(1);
+    simplifyI24(N0, DCI);
+    simplifyI24(N1, DCI);
+    return SDValue();
+  }
   case ISD::SELECT: {
     SDValue Cond = N->getOperand(0);
     if (Cond.getOpcode() == ISD::SETCC && Cond.hasOneUse()) {
@@ -2644,6 +2683,18 @@ SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
   return DAG.getRegister(VirtualRegister, VT);
 }
 
+uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
+    const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const {
+  uint64_t ArgOffset = MFI->ABIArgOffset;
+  switch (Param) {
+  case GRID_DIM:
+    return ArgOffset;
+  case GRID_OFFSET:
+    return ArgOffset + 4;
+  }
+  llvm_unreachable("unexpected implicit parameter type");
+}
+
 #define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
 
 const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index fbb7d3c88437..478b2035fd75 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -65,6 +65,7 @@ private:
   SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
 protected:
@@ -123,7 +124,7 @@ public:
 
   bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
 
-  MVT getVectorIdxTy() const override;
+  MVT getVectorIdxTy(const DataLayout &) const override;
   bool isSelectSupported(SelectSupportKind) const override;
 
   bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
@@ -207,6 +208,16 @@ public:
   virtual SDValue CreateLiveInRegister(SelectionDAG &DAG,
                                        const TargetRegisterClass *RC,
                                        unsigned Reg, EVT VT) const;
+
+  enum ImplicitParameter {
+    GRID_DIM,
+    GRID_OFFSET
+  };
+
+  /// \brief Helper function that returns the byte offset of the given
+  /// type of implicit parameter.
+  unsigned getImplicitParameterOffset(const AMDGPUMachineFunction *MFI,
+                                      const ImplicitParameter Param) const;
 };
 
 namespace AMDGPUISD {
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 0779d1d786b2..bd5abc4f546e 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -69,6 +69,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
       FP64Denormals(false), FP32Denormals(false), FastFMAF32(false),
       CaymanISA(false), FlatAddressSpace(false), EnableIRStructurizer(true),
       EnablePromoteAlloca(false), EnableIfCvt(true), EnableLoadStoreOpt(false),
+      EnableUnsafeDSOffsetFolding(false),
       WavefrontSize(0), CFALUBug(false), LocalMemorySize(0),
       EnableVGPRSpilling(false), SGPRInitBug(false), IsGCN(false),
       GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), LDSBankCount(0),
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 30f50eb1d2f3..90831bfb4458 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -76,6 +76,7 @@ private:
   bool EnablePromoteAlloca;
   bool EnableIfCvt;
   bool EnableLoadStoreOpt;
+  bool EnableUnsafeDSOffsetFolding;
   unsigned WavefrontSize;
   bool CFALUBug;
   int LocalMemorySize;
@@ -222,6 +223,10 @@ public:
     return EnableLoadStoreOpt;
   }
 
+  bool unsafeDSOffsetFoldingEnabled() const {
+    return EnableUnsafeDSOffsetFolding;
+  }
+
   unsigned getWavefrontSize() const {
     return WavefrontSize;
   }
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index a9a911a8efed..2297b52b423c 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -156,8 +156,10 @@ public:
 } // End of anonymous namespace
 
 TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() {
-  return TargetIRAnalysis(
-      [this](Function &F) { return TargetTransformInfo(AMDGPUTTIImpl(this)); });
+  return TargetIRAnalysis([this](Function &F) {
+    return TargetTransformInfo(
+        AMDGPUTTIImpl(this, F.getParent()->getDataLayout()));
+  });
 }
 
 void AMDGPUPassConfig::addIRPasses() {
@@ -269,6 +271,7 @@ void GCNPassConfig::addPreRegAlloc() {
     // also need extra copies to the address operand to be eliminated.
     initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
     insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID);
+    insertPass(&MachineSchedulerID, &RegisterCoalescerID);
   }
   addPass(createSIShrinkInstructionsPass(), false);
   addPass(createSIFixSGPRLiveRangesPass(), false);
@@ -280,10 +283,10 @@ void GCNPassConfig::addPostRegAlloc() {
 }
 
 void GCNPassConfig::addPreSched2() {
-  addPass(createSIInsertWaits(*TM), false);
 }
 
 void GCNPassConfig::addPreEmitPass() {
+  addPass(createSIInsertWaits(*TM), false);
   addPass(createSILowerControlFlowPass(*TM), false);
 }
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 791c84e6f28b..dee0a69d1e68 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -37,8 +37,9 @@ class AMDGPUTTIImpl : public BasicTTIImplBase<AMDGPUTTIImpl> {
   const AMDGPUTargetLowering *getTLI() const { return TLI; }
 
 public:
-  explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM)
-      : BaseT(TM), ST(TM->getSubtargetImpl()), TLI(ST->getTargetLowering()) {}
+  explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const DataLayout &DL)
+      : BaseT(TM, DL), ST(TM->getSubtargetImpl()),
+        TLI(ST->getTargetLowering()) {}
 
   // Provide value semantics. MSVC requires that we spell all of these out.
   AMDGPUTTIImpl(const AMDGPUTTIImpl &Arg)
@@ -46,18 +47,6 @@ public:
   AMDGPUTTIImpl(AMDGPUTTIImpl &&Arg)
       : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)),
         TLI(std::move(Arg.TLI)) {}
-  AMDGPUTTIImpl &operator=(const AMDGPUTTIImpl &RHS) {
-    BaseT::operator=(static_cast<const BaseT &>(RHS));
-    ST = RHS.ST;
-    TLI = RHS.TLI;
-    return *this;
-  }
-  AMDGPUTTIImpl &operator=(AMDGPUTTIImpl &&RHS) {
-    BaseT::operator=(std::move(static_cast<BaseT &>(RHS)));
-    ST = std::move(RHS.ST);
-    TLI = std::move(RHS.TLI);
-    return *this;
-  }
 
   bool hasBranchDivergence() { return true; }
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
index 7172e4bb9335..c709741f3777 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -44,7 +44,7 @@ static MCInstrInfo *createAMDGPUMCInstrInfo() {
   return X;
 }
 
-static MCRegisterInfo *createAMDGPUMCRegisterInfo(StringRef TT) {
+static MCRegisterInfo *createAMDGPUMCRegisterInfo(const Triple &TT) {
   MCRegisterInfo *X = new MCRegisterInfo();
   InitAMDGPUMCRegisterInfo(X, 0);
   return X;
@@ -52,14 +52,13 @@ static MCRegisterInfo *createAMDGPUMCRegisterInfo(StringRef TT) {
 
 static MCSubtargetInfo *
 createAMDGPUMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
-  MCSubtargetInfo * X = new MCSubtargetInfo();
-  InitAMDGPUMCSubtargetInfo(X, TT, CPU, FS);
-  return X;
+  return createAMDGPUMCSubtargetInfoImpl(TT, CPU, FS);
 }
 
-static MCCodeGenInfo *createAMDGPUMCCodeGenInfo(StringRef TT, Reloc::Model RM,
-                                               CodeModel::Model CM,
-                                               CodeGenOpt::Level OL) {
+static MCCodeGenInfo *createAMDGPUMCCodeGenInfo(const Triple &TT,
+                                                Reloc::Model RM,
+                                                CodeModel::Model CM,
+                                                CodeGenOpt::Level OL) {
   MCCodeGenInfo *X = new MCCodeGenInfo();
   X->initMCCodeGenInfo(RM, CM, OL);
   return X;
diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp
index 8357b6d9d0ed..4e4d554f0ee7 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -815,8 +815,10 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
   case Intrinsic::r600_read_local_size_z:
     return LowerImplicitParameter(DAG, VT, DL, 8);
 
-  case Intrinsic::AMDGPU_read_workdim:
-    return LowerImplicitParameter(DAG, VT, DL, MFI->ABIArgOffset / 4);
+  case Intrinsic::AMDGPU_read_workdim: {
+    uint32_t ByteOffset = getImplicitParameterOffset(MFI, GRID_DIM);
+    return LowerImplicitParameter(DAG, VT, DL, ByteOffset / 4);
+  }
 
   case Intrinsic::r600_read_tgid_x:
     return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
@@ -897,8 +899,9 @@ SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
 
   for (unsigned i = 0, e = VecVT.getVectorNumElements(); i != e; ++i) {
-    Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector,
-                               DAG.getConstant(i, DL, getVectorIdxTy())));
+    Args.push_back(DAG.getNode(
+        ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector,
+        DAG.getConstant(i, DL, getVectorIdxTy(DAG.getDataLayout()))));
   }
 
   return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
@@ -1459,22 +1462,17 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
   SDValue Ptr = Op.getOperand(1);
   SDValue LoweredLoad;
 
-  SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
-  if (Ret.getNode()) {
-    SDValue Ops[2] = {
-      Ret,
-      Chain
-    };
-    return DAG.getMergeValues(Ops, DL);
-  }
+  if (SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG))
+    return Ret;
 
   // Lower loads constant address space global variable loads
   if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
       isa<GlobalVariable>(GetUnderlyingObject(
-          LoadNode->getMemOperand()->getValue(), *getDataLayout()))) {
+          LoadNode->getMemOperand()->getValue(), DAG.getDataLayout()))) {
 
-    SDValue Ptr = DAG.getZExtOrTrunc(LoadNode->getBasePtr(), DL,
-        getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
+    SDValue Ptr = DAG.getZExtOrTrunc(
+        LoadNode->getBasePtr(), DL,
+        getPointerTy(DAG.getDataLayout(), AMDGPUAS::PRIVATE_ADDRESS));
     Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
                       DAG.getConstant(2, DL, MVT::i32));
     return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(),
@@ -1702,7 +1700,8 @@ SDValue R600TargetLowering::LowerFormalArguments(
   return Chain;
 }
 
-EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
+EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
+                                           EVT VT) const {
    if (!VT.isVector())
      return MVT::i32;
    return VT.changeVectorElementTypeToInteger();
diff --git a/lib/Target/AMDGPU/R600ISelLowering.h b/lib/Target/AMDGPU/R600ISelLowering.h
index c06d3c4fd309..4dbac97af2a1 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.h
+++ b/lib/Target/AMDGPU/R600ISelLowering.h
@@ -38,7 +38,9 @@ public:
                                const SmallVectorImpl<ISD::InputArg> &Ins,
                                SDLoc DL, SelectionDAG &DAG,
                                SmallVectorImpl<SDValue> &InVals) const override;
-  EVT getSetCCResultType(LLVMContext &, EVT VT) const override;
+  EVT getSetCCResultType(const DataLayout &DL, LLVMContext &,
+                         EVT VT) const override;
+
 private:
   unsigned Gen;
   /// Each OpenCL kernel has nine implicit parameters that are stored in the
diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp
index d14e37a64612..c2887255cc11 100644
--- a/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -126,11 +126,42 @@ static bool updateOperand(FoldCandidate &Fold,
   return false;
 }
 
+static bool isUseMIInFoldList(const std::vector<FoldCandidate> &FoldList,
+                              const MachineInstr *MI) {
+  for (auto Candidate : FoldList) {
+    if (Candidate.UseMI == MI)
+      return true;
+  }
+  return false;
+}
+
 static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList,
                              MachineInstr *MI, unsigned OpNo,
                              MachineOperand *OpToFold,
                              const SIInstrInfo *TII) {
   if (!TII->isOperandLegal(MI, OpNo, OpToFold)) {
+
+    // Special case for v_mac_f32_e64 if we are trying to fold into src2
+    unsigned Opc = MI->getOpcode();
+    if (Opc == AMDGPU::V_MAC_F32_e64 &&
+        (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) {
+      // Check if changing this to a v_mad_f32 instruction will allow us to
+      // fold the operand.
+      MI->setDesc(TII->get(AMDGPU::V_MAD_F32));
+      bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII);
+      if (FoldAsMAD) {
+        MI->untieRegOperand(OpNo);
+        return true;
+      }
+      MI->setDesc(TII->get(Opc));
+    }
+
+    // If we are already folding into another operand of MI, then
+    // we can't commute the instruction, otherwise we risk making the
+    // other fold illegal.
+    if (isUseMIInFoldList(FoldList, MI))
+      return false;
+
     // Operand is not legal, so try to commute the instruction to
     // see if this makes it possible to fold.
     unsigned CommuteIdx0;
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index ead1a3743473..dd818a9ba746 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -254,8 +254,9 @@ bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &,
   return false;
 }
 
-bool SITargetLowering::isLegalAddressingMode(const AddrMode &AM,
-                                             Type *Ty, unsigned AS) const {
+bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
+                                             const AddrMode &AM, Type *Ty,
+                                             unsigned AS) const {
   // No global is ever allowed as a base.
   if (AM.BaseGV)
     return false;
@@ -416,7 +417,7 @@ static EVT toIntegerVT(EVT VT) {
 SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
                                          SDLoc SL, SDValue Chain,
                                          unsigned Offset, bool Signed) const {
-  const DataLayout *DL = getDataLayout();
+  const DataLayout &DL = DAG.getDataLayout();
   MachineFunction &MF = DAG.getMachineFunction();
   const SIRegisterInfo *TRI =
       static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
@@ -425,16 +426,16 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
 
   Type *Ty = VT.getTypeForEVT(*DAG.getContext());
   MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
-  MVT PtrVT = getPointerTy(AMDGPUAS::CONSTANT_ADDRESS);
+  MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
   PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
   SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
                                        MRI.getLiveInVirtReg(InputPtrReg), PtrVT);
   SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
                             DAG.getConstant(Offset, SL, PtrVT));
-  SDValue PtrOffset = DAG.getUNDEF(getPointerTy(AMDGPUAS::CONSTANT_ADDRESS));
+  SDValue PtrOffset = DAG.getUNDEF(PtrVT);
   MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
 
-  unsigned Align = DL->getABITypeAlignment(Ty);
+  unsigned Align = DL.getABITypeAlignment(Ty);
 
   if (VT != MemVT && VT.isFloatingPoint()) {
     // Do an integer load and convert.
@@ -451,7 +452,12 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
                                true, // isNonTemporal
                                true, // isInvariant
                                Align); // Alignment
-    return DAG.getNode(ISD::FP16_TO_FP, SL, VT, Load);
+    SDValue Ops[] = {
+      DAG.getNode(ISD::FP16_TO_FP, SL, VT, Load),
+      Load.getValue(1)
+    };
+
+    return DAG.getMergeValues(Ops, SL);
   }
 
   ISD::LoadExtType ExtTy = Signed ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
@@ -569,6 +575,8 @@ SDValue SITargetLowering::LowerFormalArguments(
 
   AnalyzeFormalArguments(CCInfo, Splits);
 
+  SmallVector<SDValue, 16> Chains;
+
   for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
 
     const ISD::InputArg &Arg = Ins[i];
@@ -587,8 +595,9 @@ SDValue SITargetLowering::LowerFormalArguments(
                                    VA.getLocMemOffset();
       // The first 36 bytes of the input buffer contains information about
       // thread group and global sizes.
-      SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, DAG.getRoot(),
+      SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, Chain,
                                    Offset, Ins[i].Flags.isSExt());
+      Chains.push_back(Arg.getValue(1));
 
       const PointerType *ParamTy =
         dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
@@ -614,7 +623,8 @@ SDValue SITargetLowering::LowerFormalArguments(
       Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0,
                                      &AMDGPU::SReg_64RegClass);
       Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass);
-      InVals.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT));
+      SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+      InVals.push_back(Copy);
       continue;
     }
 
@@ -634,7 +644,9 @@ SDValue SITargetLowering::LowerFormalArguments(
       for (unsigned j = 1; j != NumElements; ++j) {
         Reg = ArgLocs[ArgIdx++].getLocReg();
         Reg = MF.addLiveIn(Reg, RC);
-        Regs.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT));
+
+        SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+        Regs.push_back(Copy);
       }
 
       // Fill up the missing vector elements
@@ -653,7 +665,11 @@ SDValue SITargetLowering::LowerFormalArguments(
         AMDGPU::SGPR_32RegClass.begin(), AMDGPU::SGPR_32RegClass.getNumRegs()));
     Info->ScratchOffsetReg = AMDGPU::SGPR_32RegClass.getRegister(ScratchIdx);
   }
-  return Chain;
+
+  if (Chains.empty())
+    return Chain;
+
+  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
 }
 
 MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
@@ -695,14 +711,15 @@ bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
   return true;
 }
 
-EVT SITargetLowering::getSetCCResultType(LLVMContext &Ctx, EVT VT) const {
+EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
+                                         EVT VT) const {
   if (!VT.isVector()) {
     return MVT::i1;
   }
   return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
 }
 
-MVT SITargetLowering::getScalarShiftAmountTy(EVT VT) const {
+MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT) const {
   return MVT::i32;
 }
 
@@ -888,7 +905,7 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
   SDLoc DL(GSD);
   const GlobalValue *GV = GSD->getGlobal();
-  MVT PtrVT = getPointerTy(GSD->getAddressSpace());
+  MVT PtrVT = getPointerTy(DAG.getDataLayout(), GSD->getAddressSpace());
 
   SDValue Ptr = DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT);
   SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32);
@@ -926,6 +943,7 @@ SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL,
 SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                   SelectionDAG &DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
+  auto MFI = MF.getInfo<SIMachineFunctionInfo>();
   const SIRegisterInfo *TRI =
       static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
 
@@ -964,8 +982,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
 
   case Intrinsic::AMDGPU_read_workdim:
     return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                          MF.getInfo<SIMachineFunctionInfo>()->ABIArgOffset,
-                          false);
+                          getImplicitParameterOffset(MFI, GRID_DIM), false);
 
   case Intrinsic::r600_read_tgid_x:
     return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
@@ -1213,7 +1230,8 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
 
   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
 
-  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32);
+  EVT SetCCVT =
+      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
 
   SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
@@ -1411,7 +1429,7 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
   unsigned AS = Load->getAddressSpace();
   unsigned Align = Load->getAlignment();
   Type *Ty = LoadVT.getTypeForEVT(*DAG.getContext());
-  unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(Ty);
+  unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty);
 
   // Don't try to replace the load if we have to expand it due to alignment
   // problems. Otherwise we will end up scalarizing the load, and trying to
@@ -2212,9 +2230,8 @@ SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
 std::pair<unsigned, const TargetRegisterClass *>
 SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
-                                               const std::string &Constraint_,
+                                               StringRef Constraint,
                                                MVT VT) const {
-  StringRef Constraint(Constraint_);
   if (Constraint == "r") {
     switch(VT.SimpleTy) {
       default: llvm_unreachable("Unhandled type for 'r' inline asm constraint");
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index a956b013bdb1..635b4edc89de 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -62,8 +62,8 @@ public:
   bool isShuffleMaskLegal(const SmallVectorImpl<int> &/*Mask*/,
                           EVT /*VT*/) const override;
 
-  bool isLegalAddressingMode(const AddrMode &AM,
-                             Type *Ty, unsigned AS) const override;
+  bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
+                             unsigned AS) const override;
 
   bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS,
                                       unsigned Align,
@@ -90,8 +90,9 @@ public:
   MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI,
                                       MachineBasicBlock * BB) const override;
   bool enableAggressiveFMAFusion(EVT VT) const override;
-  EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override;
-  MVT getScalarShiftAmountTy(EVT VT) const override;
+  EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
+                         EVT VT) const override;
+  MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override;
   bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
   SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
   SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
@@ -114,9 +115,9 @@ public:
                           SDLoc DL,
                           SDValue Ptr) const;
 
-  std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint(
-                                   const TargetRegisterInfo *TRI,
-                                   const std::string &Constraint, MVT VT) const override;
+  std::pair<unsigned, const TargetRegisterClass *>
+  getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                               StringRef Constraint, MVT VT) const override;
 
   SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, SDValue V) const;
 };
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index eb96bd0227b2..18910615bebe 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -227,9 +227,8 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
       uint8_t Offset0 = Offset0Imm->getImm();
       uint8_t Offset1 = Offset1Imm->getImm();
-      assert(Offset1 > Offset0);
-      if (Offset1 - Offset0 == 1) {
+      if (Offset1 > Offset0 && Offset1 - Offset0 == 1) {
         // Each of these offsets is in element sized units, so we need to convert
         // to bytes of the individual reads.
@@ -924,7 +923,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
     return false;
 
   unsigned Opc = UseMI->getOpcode();
-  if (Opc == AMDGPU::V_MAD_F32) {
+  if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64) {
     // Don't fold if we are using source modifiers. The new VOP2 instructions
     // don't have them.
     if (hasModifiersSet(*UseMI, AMDGPU::OpName::src0_modifiers) ||
@@ -963,9 +962,9 @@ bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
       // instead of having to modify in place.
 
       // Remove these first since they are at the end.
-      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
+      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
                                                       AMDGPU::OpName::omod));
-      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
+      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
                                                       AMDGPU::OpName::clamp));
 
       unsigned Src1Reg = Src1->getReg();
@@ -980,6 +979,14 @@ bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
       Src1->setSubReg(Src2SubReg);
       Src1->setIsKill(Src2->isKill());
 
+      if (Opc == AMDGPU::V_MAC_F32_e64) {
+        UseMI->untieRegOperand(
+          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
+      }
+
+      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
+                                                      AMDGPU::OpName::src2));
+
+      // ChangingToImmediate adds Src2 back to the instruction.
       Src2->ChangeToImmediate(Imm);
 
       removeModOperands(*UseMI);
@@ -1010,11 +1017,17 @@ bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
       // instead of having to modify in place.
 
      // Remove these first since they are at the end.
-      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
+      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
                                                       AMDGPU::OpName::omod));
-      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32,
+      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
                                                       AMDGPU::OpName::clamp));
 
+      if (Opc == AMDGPU::V_MAC_F32_e64) {
+        UseMI->untieRegOperand(
+          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
+      }
+
+      // ChangingToImmediate adds Src2 back to the instruction.
       Src2->ChangeToImmediate(Imm);
 
       // These come before src2.
@@ -1126,6 +1139,38 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa,
   return false;
 }
 
+MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
+                                                 MachineBasicBlock::iterator &MI,
+                                                 LiveVariables *LV) const {
+
+  switch (MI->getOpcode()) {
+    default: return nullptr;
+    case AMDGPU::V_MAC_F32_e64: break;
+    case AMDGPU::V_MAC_F32_e32: {
+      const MachineOperand *Src0 = getNamedOperand(*MI, AMDGPU::OpName::src0);
+      if (Src0->isImm() && !isInlineConstant(*Src0, 4))
+        return nullptr;
+      break;
+    }
+  }
+
+  const MachineOperand *Dst = getNamedOperand(*MI, AMDGPU::OpName::dst);
+  const MachineOperand *Src0 = getNamedOperand(*MI, AMDGPU::OpName::src0);
+  const MachineOperand *Src1 = getNamedOperand(*MI, AMDGPU::OpName::src1);
+  const MachineOperand *Src2 = getNamedOperand(*MI, AMDGPU::OpName::src2);
+
+  return BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_MAD_F32))
+      .addOperand(*Dst)
+      .addImm(0) // Src0 mods
+      .addOperand(*Src0)
+      .addImm(0) // Src1 mods
+      .addOperand(*Src1)
+      .addImm(0) // Src mods
+      .addOperand(*Src2)
+      .addImm(0) // clamp
+      .addImm(0); // omod
+}
+
 bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
   int64_t SVal = Imm.getSExtValue();
   if (SVal >= -16 && SVal <= 64)
@@ -1625,7 +1670,10 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
 
   if (MO->isReg()) {
     assert(DefinedRC);
-    const TargetRegisterClass *RC = MRI.getRegClass(MO->getReg());
+    const TargetRegisterClass *RC =
+        TargetRegisterInfo::isVirtualRegister(MO->getReg()) ?
+            MRI.getRegClass(MO->getReg()) :
+            RI.getPhysRegClass(MO->getReg());
 
     // In order to be legal, the common sub-class must be equal to the
     // class of the current operand. For example:
diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h
index 0382272068d2..015ea12d4598 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/lib/Target/AMDGPU/SIInstrInfo.h
@@ -144,6 +144,10 @@ public:
   unsigned getMachineCSELookAheadLimit() const override { return 500; }
 
+  MachineInstr *convertToThreeAddress(MachineFunction::iterator &MBB,
+                                      MachineBasicBlock::iterator &MI,
+                                      LiveVariables *LV) const override;
+
   bool isSALU(uint16_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::SALU;
   }
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td
index fcb58d5da3b0..b39a78714640 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@@ -529,9 +529,11 @@ def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">;
 def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">;
 
 def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">;
+def VOP3NoMods0 : ComplexPattern<untyped, 4, "SelectVOP3NoMods0">;
 def VOP3Mods0Clamp : ComplexPattern<untyped, 3, "SelectVOP3Mods0Clamp">;
 def VOP3Mods0Clamp0OMod : ComplexPattern<untyped, 4, "SelectVOP3Mods0Clamp0OMod">;
 def VOP3Mods : ComplexPattern<untyped, 2, "SelectVOP3Mods">;
+def VOP3NoMods : ComplexPattern<untyped, 2, "SelectVOP3NoMods">;
 
 //===----------------------------------------------------------------------===//
 // SI assembler operands
@@ -1113,6 +1115,13 @@ def VOP_MADK : VOPProfile <[f32, f32, f32, f32]> {
   field dag Ins = (ins VCSrc_32:$src0, VGPR_32:$vsrc1, u32imm:$src2);
   field string Asm = "$dst, $src0, $vsrc1, $src2";
 }
+def VOP_MAC : VOPProfile <[f32, f32, f32, f32]> {
+  let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2);
+  let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3,
+                       HasModifiers>.ret;
+  let Asm32 = getAsm32<2>.ret;
+  let Asm64 = getAsm64<2, HasModifiers>.ret;
+}
 def VOP_F64_F64_F64_F64 : VOPProfile <[f64, f64, f64, f64]>;
 def VOP_I32_I32_I32_I32 : VOPProfile <[i32, i32, i32, i32]>;
 def VOP_I64_I32_I32_I64 : VOPProfile <[i64, i32, i32, i64]>;
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
index 8c8d836776db..1ee63c675822 100644
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -1488,7 +1488,10 @@ defm V_AND_B32 : VOP2Inst <vop2<0x1b, 0x13>, "v_and_b32", VOP_I32_I32_I32>;
 defm V_OR_B32 : VOP2Inst <vop2<0x1c, 0x14>, "v_or_b32", VOP_I32_I32_I32>;
 defm V_XOR_B32 : VOP2Inst <vop2<0x1d, 0x15>, "v_xor_b32", VOP_I32_I32_I32>;
 
-defm V_MAC_F32 : VOP2Inst <vop2<0x1f, 0x16>, "v_mac_f32", VOP_F32_F32_F32>;
+let Constraints = "$dst = $src2", DisableEncoding="$src2",
+    isConvertibleToThreeAddress = 1 in {
+defm V_MAC_F32 : VOP2Inst <vop2<0x1f, 0x16>, "v_mac_f32", VOP_MAC>;
+}
 } // End isCommutable = 1
 
 defm V_MADMK_F32 : VOP2MADK <vop2<0x20, 0x17>, "v_madmk_f32">;
@@ -2206,6 +2209,15 @@ def : Pat <
   (V_CNDMASK_B32_e64 $src2, $src1, $src0)
 >;
 
+// Pattern for V_MAC_F32
+def : Pat <
+  (fmad  (VOP3NoMods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod),
+         (VOP3NoMods f32:$src1, i32:$src1_modifiers),
+         (VOP3NoMods f32:$src2, i32:$src2_modifiers)),
+  (V_MAC_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
+                 $src2_modifiers, $src2, $clamp, $omod)
+>;
+
 /********** ======================= **********/
 /********** Image sampling patterns **********/
 /********** ======================= **********/
diff --git a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 9b1d256dc5a8..1bdb1f0ee9f9 100644
--- a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -214,12 +214,11 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
   // cases, like vectors of pointers.
   const MachineOperand *AddrReg = TII->getNamedOperand(*I, AMDGPU::OpName::addr);
 
-  unsigned DestReg0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst)->getReg();
-  unsigned DestReg1
-    = TII->getNamedOperand(*Paired, AMDGPU::OpName::vdst)->getReg();
+  const MachineOperand *Dest0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst);
+  const MachineOperand *Dest1 = TII->getNamedOperand(*Paired, AMDGPU::OpName::vdst);
 
   unsigned Offset0
-          = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff;
+    = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff;
   unsigned Offset1
     = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff;
@@ -258,20 +257,43 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
   unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
   unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
-  updateRegDefsUses(DestReg0, DestReg, SubRegIdx0);
-  updateRegDefsUses(DestReg1, DestReg, SubRegIdx1);
 
-  LIS->RemoveMachineInstrFromMaps(I);
-  // Replacing Paired in the maps with Read2 allows us to avoid updating the
-  // live range for the m0 register.
-  LIS->ReplaceMachineInstrInMaps(Paired, Read2);
+  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
+
+  // Copy to the old destination registers.
+  MachineInstr *Copy0 = BuildMI(*MBB, I, DL, CopyDesc)
+    .addOperand(*Dest0) // Copy to same destination including flags and sub reg.
+    .addReg(DestReg, 0, SubRegIdx0);
+  MachineInstr *Copy1 = BuildMI(*MBB, I, DL, CopyDesc)
+    .addOperand(*Dest1)
+    .addReg(DestReg, RegState::Kill, SubRegIdx1);
+
+  LIS->InsertMachineInstrInMaps(Read2);
+
+  // repairLiveintervalsInRange() doesn't handle physical register, so we have
+  // to update the M0 range manually.
+  SlotIndex PairedIndex = LIS->getInstructionIndex(Paired);
+  LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI));
+  LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex);
+  bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot();
+
+  // The new write to the original destination register is now the copy. Steal
+  // the old SlotIndex.
+  LIS->ReplaceMachineInstrInMaps(I, Copy0);
+  LIS->ReplaceMachineInstrInMaps(Paired, Copy1);
 
   I->eraseFromParent();
   Paired->eraseFromParent();
 
   LiveInterval &AddrRegLI = LIS->getInterval(AddrReg->getReg());
   LIS->shrinkToUses(&AddrRegLI);
 
-  LIS->getInterval(DestReg); // Create new LI
+  LIS->createAndComputeVirtRegInterval(DestReg);
+
+  if (UpdateM0Range) {
+    SlotIndex Read2Index = LIS->getInstructionIndex(Read2);
+    M0Segment->end = Read2Index.getRegSlot();
+  }
 
   DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
   return Read2.getInstr();
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 587ea63d6796..d23b92edef33 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -53,7 +53,6 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
   if (!LaneVGPRs.count(LaneVGPRIdx)) {
     unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass);
     LaneVGPRs[LaneVGPRIdx] = LaneVGPR;
-    MRI.setPhysRegUsed(LaneVGPR);
 
     // Add this register as live-in to all blocks to avoid machine verifer
     // complaining about use of an undefined physical register.
diff --git a/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp b/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp
index 0a7f684552f0..b086d2ed6652 100644
--- a/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp
+++ b/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp
@@ -91,7 +91,6 @@ bool SIPrepareScratchRegs::runOnMachineFunction(MachineFunction &MF) {
 
   if (ScratchOffsetReg != AMDGPU::NoRegister) {
     // Found an SGPR to use
-    MRI.setPhysRegUsed(ScratchOffsetReg);
     BuildMI(*Entry, I, DL, TII->get(AMDGPU::S_MOV_B32), ScratchOffsetReg)
             .addReg(ScratchOffsetPreloadReg);
   } else {
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp
index db2ff0b1f952..ce4acafac9fa 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -499,7 +499,7 @@ unsigned SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
   for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end();
        I != E; ++I) {
-    if (!MRI.isPhysRegUsed(*I))
+    if (MRI.reg_nodbg_empty(*I))
       return *I;
   }
   return AMDGPU::NoRegister;
diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 51e72cdb5f9e..5d00bdd6a9bb 100644
--- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -94,8 +94,20 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
   // is vcc.  We should handle this the same way we handle vopc, by addding
   // a register allocation hint pre-regalloc and then do the shrining
   // post-regalloc.
-  if (Src2)
-    return false;
+  if (Src2) {
+    switch (MI.getOpcode()) {
+      default: return false;
+
+      case AMDGPU::V_MAC_F32_e64:
+        if (!isVGPR(Src2, TRI, MRI) ||
+            TII->hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
+          return false;
+        break;
+
+      case AMDGPU::V_CNDMASK_B32_e64:
+        break;
+    }
+  }
 
   const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
   const MachineOperand *Src1Mod =
@@ -149,7 +161,7 @@ static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
     return;
 
   // Try to fold Src0
-  if (Src0.isReg()) {
+  if (Src0.isReg() && MRI.hasOneUse(Src0.getReg())) {
     unsigned Reg = Src0.getReg();
     MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
     if (Def && Def->isMoveImmediate()) {
@@ -243,6 +255,22 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
         continue;
       }
 
+      if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
+        // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
+        // instructions.
+        const MachineOperand *Src2 =
+            TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+        if (!Src2->isReg())
+          continue;
+        unsigned SReg = Src2->getReg();
+        if (TargetRegisterInfo::isVirtualRegister(SReg)) {
+          MRI.setRegAllocationHint(SReg, 0, AMDGPU::VCC);
+          continue;
+        }
+        if (SReg != AMDGPU::VCC)
+          continue;
+      }
+
       // We can shrink this instruction
       DEBUG(dbgs() << "Shrinking "; MI.dump(); dbgs() << '\n';);
 
@@ -259,6 +287,11 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
       if (Src1)
         Inst32.addOperand(*Src1);
 
+      const MachineOperand *Src2 =
+          TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+      if (Src2)
+        Inst32.addOperand(*Src2);
+
      ++NumInstructionsShrunk;
      MI.eraseFromParent();
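
The performShlCombine addition rewrites i64 (shl x, 32) as (build_pair 0, x), i.e. the shifted value's low word is zero and its high word is the low 32 bits of x. A minimal standalone C++ sketch of that identity (illustrative only, not part of the commit):

// Checks the identity behind performShlCombine:
// i64 (shl x, 32) == (build_pair 0, lo32(x)).
#include <cassert>
#include <cstdint>

static uint64_t shl32ViaPair(uint64_t X) {
  uint32_t Lo = static_cast<uint32_t>(X);  // models ISD::TRUNCATE to i32
  return static_cast<uint64_t>(Lo) << 32;  // models BUILD_PAIR (0, Lo)
}

int main() {
  const uint64_t Tests[] = {0, 1, 0xdeadbeef, 0x123456789abcdef0ULL, ~0ULL};
  for (uint64_t X : Tests)
    assert(shl32ViaPair(X) == X << 32);
}

Replacing the 64-bit shift with a truncate plus two 32-bit values is why the commit comment expects it to cost the same as two v_mov_b32_e32 while being more transparent to copy-aware MI optimizations.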

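The V_MAC_F32 work (the VOP_MAC profile, convertToThreeAddress, and the V_MAC_F32_e64 cases in FoldImmediate and SIFoldOperands) rests on the fact that V_MAC_F32, which computes dst = src0 * src1 + dst with dst tied to src2, produces the same value as V_MAD_F32 once the tied accumulator is renamed into an independent src2; untying src2 is what lets an immediate be folded into it. A standalone sketch of that equivalence (illustrative only, not LLVM API):

#include <cassert>

// mac: the accumulator is both an input and the destination (tied operand).
static float mac(float Dst, float S0, float S1) { return S0 * S1 + Dst; }

// mad: the addend is an independent third source operand.
static float mad(float S0, float S1, float S2) { return S0 * S1 + S2; }

int main() {
  float Acc = 2.0f;
  // Renaming the tied accumulator into an explicit src2 preserves the value,
  // which is what makes the V_MAC_F32 -> V_MAD_F32 conversion legal.
  assert(mac(Acc, 3.0f, 4.0f) == mad(3.0f, 4.0f, Acc));
}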