Diffstat (limited to 'lib/Target')
87 files changed, 1651 insertions, 723 deletions
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index 5ddf66654a67..da68f3165c5e 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -315,8 +315,14 @@ def AArch64umaxv : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>;
 // AArch64 Instruction Predicate Definitions.
 def IsDarwin : Predicate<"Subtarget->isTargetDarwin()">;
 def IsNotDarwin: Predicate<"!Subtarget->isTargetDarwin()">;
-def ForCodeSize : Predicate<"Subtarget->getForCodeSize()">;
-def NotForCodeSize : Predicate<"!Subtarget->getForCodeSize()">;
+
+// We could compute these on a per-module basis but doing so requires accessing
+// the Function object through the <Target>Subtarget and objections were raised
+// to that (see post-commit review comments for r301750).
+let RecomputePerFunction = 1 in {
+  def ForCodeSize : Predicate<"MF->getFunction()->optForSize()">;
+  def NotForCodeSize : Predicate<"!MF->getFunction()->optForSize()">;
+}

 include "AArch64InstrFormats.td"

diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp
index 1c81d34014fd..b369ee7e4ba2 100644
--- a/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -164,12 +164,11 @@ struct AArch64GISelActualAccessor : public GISelAccessor {
 AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
                                    const std::string &FS,
-                                   const TargetMachine &TM, bool LittleEndian,
-                                   bool ForCodeSize)
+                                   const TargetMachine &TM, bool LittleEndian)
     : AArch64GenSubtargetInfo(TT, CPU, FS), ReserveX18(TT.isOSDarwin()),
       IsLittle(LittleEndian), TargetTriple(TT), FrameLowering(),
       InstrInfo(initializeSubtargetDependencies(FS, CPU)), TSInfo(),
-      TLInfo(TM, *this), GISel(), ForCodeSize(ForCodeSize) {
+      TLInfo(TM, *this), GISel() {
 #ifndef LLVM_BUILD_GLOBAL_ISEL
   GISelAccessor *AArch64GISel = new GISelAccessor();
 #else
diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h
index df54bf3f48e1..7933e58c49ee 100644
--- a/lib/Target/AArch64/AArch64Subtarget.h
+++ b/lib/Target/AArch64/AArch64Subtarget.h
@@ -128,8 +128,6 @@ protected:
   /// an optional library.
   std::unique_ptr<GISelAccessor> GISel;

-  bool ForCodeSize;
-
 private:
   /// initializeSubtargetDependencies - Initializes using CPUString and the
   /// passed in feature string so that we can use initializer lists for
@@ -145,7 +143,7 @@ public:
   /// of the specified triple.
   AArch64Subtarget(const Triple &TT, const std::string &CPU,
                    const std::string &FS, const TargetMachine &TM,
-                   bool LittleEndian, bool ForCodeSize);
+                   bool LittleEndian);

   /// This object will take onwership of \p GISelAccessor.
   void setGISelAccessor(GISelAccessor &GISel) {
@@ -274,8 +272,6 @@ public:
     }
   }

-  bool getForCodeSize() const { return ForCodeSize; }
-
   /// ParseSubtargetFeatures - Parses features string setting specified
   /// subtarget options.  Definition of function is auto generated by tblgen.
   void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
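A note on the first hunk above, which is the heart of this change: with RecomputePerFunction = 1, the ForCodeSize predicate is no longer a flag baked into the cached AArch64Subtarget; it is re-evaluated against the current function. A minimal sketch of what the predicate string reduces to (names from the hunk; the free-function wrapper is illustrative only):

    // What "MF->getFunction()->optForSize()" amounts to when the selector
    // evaluates a RecomputePerFunction predicate: a per-function query, so two
    // functions sharing one subtarget can still select differently.
    bool forCodeSize(const llvm::MachineFunction &MF) {
      return MF.getFunction()->optForSize(); // predicate string from the hunk
    }
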
diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp
index 5a90fd1eb1ba..132f192f2a9a 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -214,7 +214,6 @@ const AArch64Subtarget *
 AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
   Attribute CPUAttr = F.getFnAttribute("target-cpu");
   Attribute FSAttr = F.getFnAttribute("target-features");
-  bool ForCodeSize = F.optForSize();

   std::string CPU = !CPUAttr.hasAttribute(Attribute::None)
                         ? CPUAttr.getValueAsString().str()
@@ -222,17 +221,15 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
   std::string FS = !FSAttr.hasAttribute(Attribute::None)
                        ? FSAttr.getValueAsString().str()
                        : TargetFS;
-  std::string ForCodeSizeStr =
-      std::string(ForCodeSize ? "+" : "-") + "forcodesize";

-  auto &I = SubtargetMap[CPU + FS + ForCodeSizeStr];
+  auto &I = SubtargetMap[CPU + FS];
   if (!I) {
     // This needs to be done before we create a new subtarget since any
     // creation will depend on the TM and the code generation flags on the
     // function that reside in TargetOptions.
     resetTargetOptions(F);
     I = llvm::make_unique<AArch64Subtarget>(TargetTriple, CPU, FS, *this,
-                                            isLittle, ForCodeSize);
+                                            isLittle);
   }
   return I.get();
 }
@@ -324,7 +321,7 @@ TargetPassConfig *AArch64TargetMachine::createPassConfig(PassManagerBase &PM) {
 void AArch64PassConfig::addIRPasses() {
   // Always expand atomic operations, we don't deal with atomicrmw or cmpxchg
   // ourselves.
-  addPass(createAtomicExpandPass(TM));
+  addPass(createAtomicExpandPass());

   // Cmpxchg instructions are often used with a subsequent comparison to
   // determine whether it succeeded. We can exploit existing control-flow in
@@ -343,7 +340,7 @@ void AArch64PassConfig::addIRPasses() {
   // Match interleaved memory accesses to ldN/stN intrinsics.
   if (TM->getOptLevel() != CodeGenOpt::None)
-    addPass(createInterleavedAccessPass(TM));
+    addPass(createInterleavedAccessPass());

   if (TM->getOptLevel() == CodeGenOpt::Aggressive && EnableGEPOpt) {
     // Call SeparateConstOffsetFromGEP pass to extract constants within indices
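With the predicate recomputed per function, the subtarget cache above can drop the synthetic "+forcodesize"/"-forcodesize" key component and key on CPU and feature string alone. A reduced sketch of the caching pattern (hypothetical standalone types; the real code uses the SubtargetMap member and llvm::make_unique exactly as in the hunk):

    #include <map>
    #include <memory>
    #include <string>

    struct Subtarget { Subtarget(std::string CPU, std::string FS) {} }; // stand-in

    std::map<std::string, std::unique_ptr<Subtarget>> SubtargetMap;

    const Subtarget *getSubtargetImpl(const std::string &CPU,
                                      const std::string &FS) {
      auto &Entry = SubtargetMap[CPU + FS]; // one subtarget per CPU+FS pair
      if (!Entry)
        Entry = std::make_unique<Subtarget>(CPU, FS); // created on first use
      return Entry.get();
    }
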
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h
index 3f89702bed50..78ff3bbe3d1a 100644
--- a/lib/Target/AMDGPU/AMDGPU.h
+++ b/lib/Target/AMDGPU/AMDGPU.h
@@ -27,12 +27,12 @@ class PassRegistry;
 class Module;

 // R600 Passes
-FunctionPass *createR600VectorRegMerger(TargetMachine &tm);
-FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm);
+FunctionPass *createR600VectorRegMerger();
+FunctionPass *createR600ExpandSpecialInstrsPass();
 FunctionPass *createR600EmitClauseMarkers();
-FunctionPass *createR600ClauseMergePass(TargetMachine &tm);
-FunctionPass *createR600Packetizer(TargetMachine &tm);
-FunctionPass *createR600ControlFlowFinalizer(TargetMachine &tm);
+FunctionPass *createR600ClauseMergePass();
+FunctionPass *createR600Packetizer();
+FunctionPass *createR600ControlFlowFinalizer();
 FunctionPass *createAMDGPUCFGStructurizerPass();

 // SI Passes
@@ -42,24 +42,24 @@ FunctionPass *createSIFoldOperandsPass();
 FunctionPass *createSIPeepholeSDWAPass();
 FunctionPass *createSILowerI1CopiesPass();
 FunctionPass *createSIShrinkInstructionsPass();
-FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm);
+FunctionPass *createSILoadStoreOptimizerPass();
 FunctionPass *createSIWholeQuadModePass();
 FunctionPass *createSIFixControlFlowLiveIntervalsPass();
 FunctionPass *createSIFixSGPRCopiesPass();
 FunctionPass *createSIDebuggerInsertNopsPass();
 FunctionPass *createSIInsertWaitsPass();
 FunctionPass *createSIInsertWaitcntsPass();
-FunctionPass *createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM = nullptr);
+FunctionPass *createAMDGPUCodeGenPreparePass();
 FunctionPass *createAMDGPUMachineCFGStructurizerPass();

 void initializeAMDGPUMachineCFGStructurizerPass(PassRegistry&);
 extern char &AMDGPUMachineCFGStructurizerID;

-ModulePass *createAMDGPUAnnotateKernelFeaturesPass(const TargetMachine *TM = nullptr);
+ModulePass *createAMDGPUAnnotateKernelFeaturesPass();
 void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
 extern char &AMDGPUAnnotateKernelFeaturesID;

-ModulePass *createAMDGPULowerIntrinsicsPass(const TargetMachine *TM = nullptr);
+ModulePass *createAMDGPULowerIntrinsicsPass();
 void initializeAMDGPULowerIntrinsicsPass(PassRegistry &);
 extern char &AMDGPULowerIntrinsicsID;

@@ -97,7 +97,7 @@ void initializeSIOptimizeExecMaskingPass(PassRegistry &);
 extern char &SIOptimizeExecMaskingID;

 // Passes common to R600 and SI
-FunctionPass *createAMDGPUPromoteAlloca(const TargetMachine *TM = nullptr);
+FunctionPass *createAMDGPUPromoteAlloca();
 void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
 extern char &AMDGPUPromoteAllocaID;
diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
index 3d8db7cd8af5..7235d8fae332 100644
--- a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -15,6 +15,7 @@
 #include "AMDGPU.h"
 #include "AMDGPUSubtarget.h"
 #include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
@@ -27,7 +28,6 @@ namespace {

 class AMDGPUAnnotateKernelFeatures : public ModulePass {
 private:
-  const TargetMachine *TM;
   AMDGPUAS AS;

   static bool hasAddrSpaceCast(const Function &F, AMDGPUAS AS);
@@ -37,8 +37,7 @@ private:
 public:
   static char ID;

-  AMDGPUAnnotateKernelFeatures(const TargetMachine *TM_ = nullptr) :
-    ModulePass(ID), TM(TM_) {}
+  AMDGPUAnnotateKernelFeatures() : ModulePass(ID) {}
   bool runOnModule(Module &M) override;
   StringRef getPassName() const override {
     return "AMDGPU Annotate Kernel Features";
@@ -221,8 +220,10 @@ bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) {
       if (F.hasFnAttribute("amdgpu-queue-ptr"))
         continue;

-      bool HasApertureRegs =
-        TM && TM->getSubtarget<AMDGPUSubtarget>(F).hasApertureRegs();
+      auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+      bool HasApertureRegs = TPC && TPC->getTM<TargetMachine>()
+                                        .getSubtarget<AMDGPUSubtarget>(F)
+                                        .hasApertureRegs();
       if (!HasApertureRegs && hasAddrSpaceCast(F, AS))
         F.addFnAttr("amdgpu-queue-ptr");
     }
@@ -231,6 +232,6 @@ bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) {
   return Changed;
 }

-ModulePass *llvm::createAMDGPUAnnotateKernelFeaturesPass(const TargetMachine *TM) {
-  return new AMDGPUAnnotateKernelFeatures(TM);
+ModulePass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
+  return new AMDGPUAnnotateKernelFeatures();
 }
diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.h b/lib/Target/AMDGPU/AMDGPUCallLowering.h
index 09bdf8ffcde7..251cb7a2c440 100644
--- a/lib/Target/AMDGPU/AMDGPUCallLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -38,7 +38,8 @@ class AMDGPUCallLowering: public CallLowering {
                    unsigned VReg) const override;
   bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
                             ArrayRef<unsigned> VRegs) const override;
-  CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const;
+  static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);
+  static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg);
 };
 } // End of namespace llvm;
 #endif
diff --git a/lib/Target/AMDGPU/AMDGPUCallingConv.td b/lib/Target/AMDGPU/AMDGPUCallingConv.td
index d308f718aae1..4bef7a89bfe3 100644
--- a/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -13,6 +13,8 @@

 // Inversion of CCIfInReg
 class CCIfNotInReg<CCAction A> : CCIf<"!ArgFlags.isInReg()", A> {}
+class CCIfExtend<CCAction A>
+  : CCIf<"ArgFlags.isSExt() || ArgFlags.isZExt()", A>;

 // Calling convention for SI
 def CC_SI : CallingConv<[
@@ -52,7 +54,7 @@ def CC_SI : CallingConv<[
   ]>>>
 ]>;

-def RetCC_SI : CallingConv<[
+def RetCC_SI_Shader : CallingConv<[
   CCIfType<[i32] , CCAssignToReg<[
     SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
     SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
@@ -99,6 +101,52 @@ def CC_AMDGPU_Kernel : CallingConv<[
   CCCustom<"allocateKernArg">
 ]>;

+def CSR_AMDGPU_VGPRs_24_255 : CalleeSavedRegs<
+  (sequence "VGPR%u", 24, 255)
+>;
+
+def CSR_AMDGPU_VGPRs_32_255 : CalleeSavedRegs<
+  (sequence "VGPR%u", 32, 255)
+>;
+
+def CSR_AMDGPU_SGPRs_32_103 : CalleeSavedRegs<
+  (sequence "SGPR%u", 32, 103)
+>;
+
+def CSR_AMDGPU_HighRegs : CalleeSavedRegs<
+  (add CSR_AMDGPU_VGPRs_32_255, CSR_AMDGPU_SGPRs_32_103)
+>;
+
+// Calling convention for leaf functions
+def CC_AMDGPU_Func : CallingConv<[
+  CCIfByVal<CCPassByVal<4, 4>>,
+  CCIfType<[i1], CCPromoteToType<i32>>,
+  CCIfType<[i1, i8, i16], CCIfExtend<CCPromoteToType<i32>>>,
+  CCIfType<[i32, f32, i16, f16, v2i16, v2f16, i1], CCAssignToReg<[
+    VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
+    VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
+    VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
+    VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>,
+  CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64], CCCustom<"allocateVGPRTuple">>,
+  CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>,
+  CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>,
+  CCIfType<[v4i32, v4f32, v2i64, v2f64], CCAssignToStack<16, 4>>,
+  CCIfType<[v8i32, v8f32], CCAssignToStack<32, 4>>,
+  CCIfType<[v16i32, v16f32], CCAssignToStack<64, 4>>
+]>;
+
+// Calling convention for leaf functions
+def RetCC_AMDGPU_Func : CallingConv<[
+  CCIfType<[i1], CCPromoteToType<i32>>,
+  CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>,
+  CCIfType<[i32, f32, i16, f16, v2i16, v2f16], CCAssignToReg<[
+    VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
+    VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
+    VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
+    VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>,
+  CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64], CCCustom<"allocateVGPRTuple">>
+]>;
+
 def CC_AMDGPU : CallingConv<[
   CCIf<"static_cast<const AMDGPUSubtarget&>"
        "(State.getMachineFunction().getSubtarget()).getGeneration() >="
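A worked reading of CC_AMDGPU_Func above, applying its rules top to bottom (the concrete register assignments are an assumed illustration, not output from the allocator):

    // Hypothetical signature: void f(i8 signext %a, <2 x i64> %b, float %c)
    //   %a: i8 + sext flag -> CCIfExtend promotes it to i32 -> first free VGPR
    //   %b: v2i64 (128-bit) -> CCCustom "allocateVGPRTuple" -> a VReg_128
    //       tuple, i.e. four consecutive VGPRs
    //   %c: f32 -> next free 32-bit VGPR after the tuple
    // Arguments that do not fit in VGPR0-31 fall through to the
    // CCAssignToStack rules, all with 4-byte stack alignment.
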
diff --git a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index e19314fe0a6c..d923cb117c12 100644
--- a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -19,6 +19,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/DivergenceAnalysis.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constants.h"
@@ -48,7 +49,6 @@ namespace {

 class AMDGPUCodeGenPrepare : public FunctionPass,
                              public InstVisitor<AMDGPUCodeGenPrepare, bool> {
-  const GCNTargetMachine *TM;
   const SISubtarget *ST = nullptr;
   DivergenceAnalysis *DA = nullptr;
   Module *Mod = nullptr;
@@ -127,8 +127,7 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
 public:
   static char ID;

-  AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) :
-    FunctionPass(ID), TM(static_cast<const GCNTargetMachine *>(TM)) {}
+  AMDGPUCodeGenPrepare() : FunctionPass(ID) {}

   bool visitFDiv(BinaryOperator &I);

@@ -487,10 +486,15 @@ bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
 }

 bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
-  if (!TM || skipFunction(F))
+  if (skipFunction(F))
     return false;

-  ST = &TM->getSubtarget<SISubtarget>(F);
+  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+  if (!TPC)
+    return false;
+
+  const TargetMachine &TM = TPC->getTM<TargetMachine>();
+  ST = &TM.getSubtarget<SISubtarget>(F);
   DA = &getAnalysis<DivergenceAnalysis>();
   HasUnsafeFPMath = hasUnsafeFPMath(F);

@@ -507,14 +511,14 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
   return MadeChange;
 }

-INITIALIZE_TM_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
+INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                       "AMDGPU IR optimizations", false, false)
 INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
-INITIALIZE_TM_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE,
-                       "AMDGPU IR optimizations", false, false)
+INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
+                    false, false)

 char AMDGPUCodeGenPrepare::ID = 0;

-FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM) {
-  return new AMDGPUCodeGenPrepare(TM);
+FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
+  return new AMDGPUCodeGenPrepare();
 }
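This hunk shows the pattern the whole commit applies: passes no longer carry a TargetMachine pointer threaded through their create* function; they ask the running pass pipeline for it. A sketch of the idiom for any codegen IR pass (MyTargetPass and MySubtarget are placeholder names):

    #include "llvm/CodeGen/TargetPassConfig.h"
    #include "llvm/Target/TargetMachine.h"

    bool MyTargetPass::runOnFunction(Function &F) {
      // TargetPassConfig is only registered when running inside a codegen
      // pipeline; conservatively do nothing when it is absent (e.g. under a
      // bare opt invocation with no target machine).
      auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
      if (!TPC)
        return false;
      const TargetMachine &TM = TPC->getTM<TargetMachine>();
      const MySubtarget &ST = TM.getSubtarget<MySubtarget>(F);
      (void)ST; // subtarget-dependent work goes here
      return false;
    }

One visible side effect: the INITIALIZE_TM_PASS macros become plain INITIALIZE_PASS, since there is no TargetMachine constructor argument left to register.
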
diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index c3ac796a0a44..19fce064783d 100644
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -82,7 +82,7 @@ public:
   void PostprocessISelDAG() override;

 private:
-  SDValue foldFrameIndex(SDValue N) const;
+  std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const;
   bool isNoNanSrc(SDValue N) const;
   bool isInlineImmediate(const SDNode *N) const;
   bool FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg, SDValue &Abs,
@@ -116,9 +116,11 @@ private:
   bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
                          SDValue &SOffset, SDValue &Offset, SDValue &SLC) const;
-  bool SelectMUBUFScratchOffen(SDValue Addr, SDValue &RSrc, SDValue &VAddr,
+  bool SelectMUBUFScratchOffen(SDNode *Root,
+                               SDValue Addr, SDValue &RSrc, SDValue &VAddr,
                                SDValue &SOffset, SDValue &ImmOffset) const;
-  bool SelectMUBUFScratchOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
+  bool SelectMUBUFScratchOffset(SDNode *Root,
+                                SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
                                 SDValue &Offset) const;
   bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset,
@@ -1074,13 +1076,33 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
   return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE);
 }

-SDValue AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
-  if (auto FI = dyn_cast<FrameIndexSDNode>(N))
-    return CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
-  return N;
+static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
+  auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
+  return PSV && PSV->isStack();
+}
+
+std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
+  const MachineFunction &MF = CurDAG->getMachineFunction();
+  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+
+  if (auto FI = dyn_cast<FrameIndexSDNode>(N)) {
+    SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
+                                              FI->getValueType(0));
+
+    // If we can resolve this to a frame index access, this is relative to the
+    // frame pointer SGPR.
+    return std::make_pair(TFI, CurDAG->getRegister(Info->getFrameOffsetReg(),
+                                                   MVT::i32));
+  }
+
+  // If we don't know this private access is a local stack object, it needs to
+  // be relative to the entry point's scratch wave offset register.
+  return std::make_pair(N, CurDAG->getRegister(Info->getScratchWaveOffsetReg(),
+                                               MVT::i32));
 }

-bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDValue Addr, SDValue &Rsrc,
+bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Root,
+                                                 SDValue Addr, SDValue &Rsrc,
                                                  SDValue &VAddr, SDValue &SOffset,
                                                  SDValue &ImmOffset) const {
@@ -1089,7 +1111,6 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDValue Addr, SDValue &Rsrc,
   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

   Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
-  SOffset = CurDAG->getRegister(Info->getScratchWaveOffsetReg(), MVT::i32);

   if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
     unsigned Imm = CAddr->getZExtValue();
@@ -1100,6 +1121,14 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDValue Addr, SDValue &Rsrc,
     MachineSDNode *MovHighBits = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
                                                         DL, MVT::i32, HighBits);
     VAddr = SDValue(MovHighBits, 0);
+
+    // In a call sequence, stores to the argument stack area are relative to the
+    // stack pointer.
+    const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Root)->getPointerInfo();
+    unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ?
+      Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg();
+
+    SOffset = CurDAG->getRegister(SOffsetReg, MVT::i32);
     ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16);
     return true;
   }
@@ -1113,19 +1142,20 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDValue Addr, SDValue &Rsrc,
       // Offsets in vaddr must be positive.
       ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
       if (isLegalMUBUFImmOffset(C1)) {
-        VAddr = foldFrameIndex(N0);
+        std::tie(VAddr, SOffset) = foldFrameIndex(N0);
         ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
         return true;
       }
   }

   // (node)
-  VAddr = foldFrameIndex(Addr);
+  std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
   ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16);
   return true;
 }

-bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDValue Addr,
+bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Root,
+                                                  SDValue Addr,
                                                   SDValue &SRsrc,
                                                   SDValue &SOffset,
                                                   SDValue &Offset) const {
@@ -1138,7 +1168,15 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDValue Addr,
   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

   SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
-  SOffset = CurDAG->getRegister(Info->getScratchWaveOffsetReg(), MVT::i32);
+
+  const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Root)->getPointerInfo();
+  unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ?
+    Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg();
+
+  // FIXME: Get from MachinePointerInfo? We should only be using the frame
+  // offset if we know this is in a call sequence.
+  SOffset = CurDAG->getRegister(SOffsetReg, MVT::i32);
+
   Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
   return true;
 }
@@ -1700,12 +1738,46 @@ bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
   return true;
 }

+static SDValue stripBitcast(SDValue Val) {
+  return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
+}
+
+// Figure out if this is really an extract of the high 16-bits of a dword.
+static bool isExtractHiElt(SDValue In, SDValue &Out) {
+  In = stripBitcast(In);
+  if (In.getOpcode() != ISD::TRUNCATE)
+    return false;
+
+  SDValue Srl = In.getOperand(0);
+  if (Srl.getOpcode() == ISD::SRL) {
+    if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
+      if (ShiftAmt->getZExtValue() == 16) {
+        Out = stripBitcast(Srl.getOperand(0));
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+// Look through operations that obscure just looking at the low 16-bits of the
+// same register.
+static SDValue stripExtractLoElt(SDValue In) {
+  if (In.getOpcode() == ISD::TRUNCATE) {
+    SDValue Src = In.getOperand(0);
+    if (Src.getValueType().getSizeInBits() == 32)
+      return stripBitcast(Src);
+  }
+
+  return In;
+}
+
 bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
                                          SDValue &SrcMods) const {
   unsigned Mods = 0;
   Src = In;

-  // FIXME: Look for on separate components
   if (Src.getOpcode() == ISD::FNEG) {
     Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
     Src = Src.getOperand(0);
@@ -1714,19 +1786,28 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
   if (Src.getOpcode() == ISD::BUILD_VECTOR) {
     unsigned VecMods = Mods;

-    SDValue Lo = Src.getOperand(0);
-    SDValue Hi = Src.getOperand(1);
+    SDValue Lo = stripBitcast(Src.getOperand(0));
+    SDValue Hi = stripBitcast(Src.getOperand(1));

     if (Lo.getOpcode() == ISD::FNEG) {
-      Lo = Lo.getOperand(0);
+      Lo = stripBitcast(Lo.getOperand(0));
       Mods ^= SISrcMods::NEG;
     }

     if (Hi.getOpcode() == ISD::FNEG) {
-      Hi = Hi.getOperand(0);
+      Hi = stripBitcast(Hi.getOperand(0));
       Mods ^= SISrcMods::NEG_HI;
     }

+    if (isExtractHiElt(Lo, Lo))
+      Mods |= SISrcMods::OP_SEL_0;
+
+    if (isExtractHiElt(Hi, Hi))
+      Mods |= SISrcMods::OP_SEL_1;
+
+    Lo = stripExtractLoElt(Lo);
+    Hi = stripExtractLoElt(Hi);
+
     if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
       // Really a scalar input. Just select from the low half of the register to
       // avoid packing.
@@ -1740,9 +1821,6 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
   }

   // Packed instructions do not have abs modifiers.
-
-  // FIXME: Handle abs/neg of individual components.
-  // FIXME: Handle swizzling with op_sel
   Mods |= SISrcMods::OP_SEL_1;

   SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
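isExtractHiElt above recognizes, at the DAG level, a value that is really the high half of a 32-bit register: (trunc (srl x, 16)), possibly hidden behind bitcasts. In plain integer terms the matched computation is just:

    #include <cstdint>
    // Scalar equivalent of the (trunc (srl x, 16)) shape isExtractHiElt
    // matches: the upper 16 bits of a dword.
    static inline uint16_t extractHiHalf(uint32_t X) {
      return static_cast<uint16_t>(X >> 16);
    }

When a BUILD_VECTOR operand is such an extract, SelectVOP3PMods can set the corresponding OP_SEL bit and read the high half in place, instead of emitting shift/pack code.
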
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index f80652b87373..5ec46a8294c0 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -76,6 +76,45 @@ static bool allocateSGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
   }
 }

+// Allocate up to VGPR31.
+//
+// TODO: Since there are no VGPR alignent requirements would it be better to
+// split into individual scalar registers?
+static bool allocateVGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
+                              CCValAssign::LocInfo LocInfo,
+                              ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  switch (LocVT.SimpleTy) {
+  case MVT::i64:
+  case MVT::f64:
+  case MVT::v2i32:
+  case MVT::v2f32: {
+    return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
+                          &AMDGPU::VReg_64RegClass, 31);
+  }
+  case MVT::v4i32:
+  case MVT::v4f32:
+  case MVT::v2i64:
+  case MVT::v2f64: {
+    return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
+                          &AMDGPU::VReg_128RegClass, 29);
+  }
+  case MVT::v8i32:
+  case MVT::v8f32: {
+    return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
+                          &AMDGPU::VReg_256RegClass, 25);
+
+  }
+  case MVT::v16i32:
+  case MVT::v16f32: {
+    return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
+                          &AMDGPU::VReg_512RegClass, 17);
+
+  }
+  default:
+    return false;
+  }
+}
+
 #include "AMDGPUGenCallingConv.inc"

 // Find a larger type to do a load / store of a vector with.
@@ -773,8 +812,43 @@ bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
 //===---------------------------------------------------------------------===//

 CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
-                                                  bool IsVarArg) const {
-  return CC_AMDGPU;
+                                                  bool IsVarArg) {
+  switch (CC) {
+  case CallingConv::AMDGPU_KERNEL:
+  case CallingConv::SPIR_KERNEL:
+    return CC_AMDGPU_Kernel;
+  case CallingConv::AMDGPU_VS:
+  case CallingConv::AMDGPU_GS:
+  case CallingConv::AMDGPU_PS:
+  case CallingConv::AMDGPU_CS:
+  case CallingConv::AMDGPU_HS:
+    return CC_AMDGPU;
+  case CallingConv::C:
+  case CallingConv::Fast:
+    return CC_AMDGPU_Func;
+  default:
+    report_fatal_error("Unsupported calling convention.");
+  }
+}
+
+CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
+                                                    bool IsVarArg) {
+  switch (CC) {
+  case CallingConv::AMDGPU_KERNEL:
+  case CallingConv::SPIR_KERNEL:
+    return CC_AMDGPU_Kernel;
+  case CallingConv::AMDGPU_VS:
+  case CallingConv::AMDGPU_GS:
+  case CallingConv::AMDGPU_PS:
+  case CallingConv::AMDGPU_CS:
+  case CallingConv::AMDGPU_HS:
+    return RetCC_SI_Shader;
+  case CallingConv::C:
+  case CallingConv::Fast:
+    return RetCC_AMDGPU_Func;
+  default:
+    report_fatal_error("Unsupported calling convention.");
+  }
 }

 /// The SelectionDAGBuilder will automatically promote function arguments
@@ -874,18 +948,15 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(CCState &State,
   }
 }

-void AMDGPUTargetLowering::AnalyzeReturn(CCState &State,
-                           const SmallVectorImpl<ISD::OutputArg> &Outs) const {
-
-  State.AnalyzeReturn(Outs, RetCC_SI);
-}
-
-SDValue
-AMDGPUTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
-                                  bool isVarArg,
-                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
-                                  const SmallVectorImpl<SDValue> &OutVals,
-                                  const SDLoc &DL, SelectionDAG &DAG) const {
+SDValue AMDGPUTargetLowering::LowerReturn(
+  SDValue Chain, CallingConv::ID CallConv,
+  bool isVarArg,
+  const SmallVectorImpl<ISD::OutputArg> &Outs,
+  const SmallVectorImpl<SDValue> &OutVals,
+  const SDLoc &DL, SelectionDAG &DAG) const {
+  // FIXME: Fails for r600 tests
+  //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
+  // "wave terminate should not have return values");
   return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
 }

@@ -896,20 +967,12 @@ AMDGPUTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,

 /// Selects the correct CCAssignFn for a given CallingConvention value.
 CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                     bool IsVarArg) {
-  switch (CC) {
-  case CallingConv::C:
-  case CallingConv::AMDGPU_KERNEL:
-  case CallingConv::SPIR_KERNEL:
-    return CC_AMDGPU_Kernel;
-  case CallingConv::AMDGPU_VS:
-  case CallingConv::AMDGPU_HS:
-  case CallingConv::AMDGPU_GS:
-  case CallingConv::AMDGPU_PS:
-  case CallingConv::AMDGPU_CS:
-    return CC_AMDGPU;
-  default:
-    report_fatal_error("Unsupported calling convention.");
-  }
+  return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
+}
+
+CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
+                                                      bool IsVarArg) {
+  return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
 }

 SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
@@ -2532,27 +2595,49 @@ SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(

 SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
-  if (N->getValueType(0) != MVT::i64)
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::i64)
     return SDValue();

-  // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
-
-  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
-  // common case, splitting this into a move and a 32-bit shift is faster and
-  // the same code size.
-  const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
   if (!RHS)
     return SDValue();

-  unsigned RHSVal = RHS->getZExtValue();
-  if (RHSVal < 32)
-    return SDValue();
-
   SDValue LHS = N->getOperand(0);
+  unsigned RHSVal = RHS->getZExtValue();
+  if (!RHSVal)
+    return LHS;

   SDLoc SL(N);
   SelectionDAG &DAG = DCI.DAG;

+  switch (LHS->getOpcode()) {
+  default:
+    break;
+  case ISD::ZERO_EXTEND:
+  case ISD::SIGN_EXTEND:
+  case ISD::ANY_EXTEND: {
+    // shl (ext x) => zext (shl x), if shift does not overflow int
+    KnownBits Known;
+    SDValue X = LHS->getOperand(0);
+    DAG.computeKnownBits(X, Known);
+    unsigned LZ = Known.countMinLeadingZeros();
+    if (LZ < RHSVal)
+      break;
+    EVT XVT = X.getValueType();
+    SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
+    return DAG.getZExtOrTrunc(Shl, SL, VT);
+  }
+  }
+
+  // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
+
+  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
+  // common case, splitting this into a move and a 32-bit shift is faster and
+  // the same code size.
+  if (RHSVal < 32)
+    return SDValue();
+
   SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);

   SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
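The new combine in performShlCombine rests on a leading-zero argument: if the known leading zeros of the pre-extension value are at least the shift amount, no set bit is shifted past the narrow type's width, so the shift can be done in the narrow type and the result zero-extended. A scalar illustration of the equivalence (hypothetical helper standing in for computeKnownBits/countMinLeadingZeros; __builtin_clz is GCC/Clang-specific):

    #include <cstdint>
    // If X (32-bit) has at least C leading zero bits, then
    //   (uint64_t)X << C == (uint64_t)(X << C)
    // i.e. a 64-bit shl of the extended value equals zext of a 32-bit shl.
    static bool narrowShlIsSafe(uint32_t X, unsigned C) {
      unsigned LZ = X == 0 ? 32 : __builtin_clz(X); // countMinLeadingZeros stand-in
      return LZ >= C;
    }
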
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 4c588a7bafd0..fb2f15022d25 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -115,9 +115,6 @@ protected:
                                      SmallVectorImpl<SDValue> &Results) const;
   void analyzeFormalArgumentsCompute(CCState &State,
                               const SmallVectorImpl<ISD::InputArg> &Ins) const;
-  void AnalyzeReturn(CCState &State,
-                     const SmallVectorImpl<ISD::OutputArg> &Outs) const;
-
 public:
   AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI);

@@ -164,6 +161,8 @@ public:
   bool isCheapToSpeculateCtlz() const override;

   static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);
+  static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg);
+
   SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
                       const SmallVectorImpl<ISD::OutputArg> &Outs,
                       const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index 353cc5742791..e286558ce60d 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -380,6 +380,6 @@ def AMDGPUendpgm : SDNode<"AMDGPUISD::ENDPGM", SDTNone,
 def AMDGPUreturn_to_epilog : SDNode<"AMDGPUISD::RETURN_TO_EPILOG", SDTNone,
   [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;

-def AMDGPUret_flag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone,
+def AMDGPUret_flag : SDNode<"AMDGPUISD::RET_FLAG", SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
  [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
>;
diff --git a/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
index dcb6670621ee..846e7dff5f8c 100644
--- a/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
+++ b/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
@@ -9,6 +9,7 @@

 #include "AMDGPU.h"
 #include "AMDGPUSubtarget.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
@@ -25,15 +26,13 @@ const unsigned MaxStaticSize = 1024;

 class AMDGPULowerIntrinsics : public ModulePass {
 private:
-  const TargetMachine *TM;
-
   bool makeLIDRangeMetadata(Function &F) const;

 public:
   static char ID;

-  AMDGPULowerIntrinsics(const TargetMachine *TM = nullptr)
-    : ModulePass(ID), TM(TM) { }
+  AMDGPULowerIntrinsics() : ModulePass(ID) {}
+
   bool runOnModule(Module &M) override;
   StringRef getPassName() const override {
     return "AMDGPU Lower Intrinsics";
@@ -46,8 +45,8 @@ char AMDGPULowerIntrinsics::ID = 0;

 char &llvm::AMDGPULowerIntrinsicsID = AMDGPULowerIntrinsics::ID;

-INITIALIZE_TM_PASS(AMDGPULowerIntrinsics, DEBUG_TYPE,
-                   "Lower intrinsics", false, false)
+INITIALIZE_PASS(AMDGPULowerIntrinsics, DEBUG_TYPE, "Lower intrinsics", false,
+                false)

 // TODO: Should refine based on estimated number of accesses (e.g. does it
 // require splitting based on alignment)
@@ -104,11 +103,13 @@ static bool expandMemIntrinsicUses(Function &F) {
 }

 bool AMDGPULowerIntrinsics::makeLIDRangeMetadata(Function &F) const {
-  if (!TM)
+  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+  if (!TPC)
     return false;

+  const TargetMachine &TM = TPC->getTM<TargetMachine>();
+  const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>(F);
   bool Changed = false;
-  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);

   for (auto *U : F.users()) {
     auto *CI = dyn_cast<CallInst>(U);
@@ -155,6 +156,6 @@ bool AMDGPULowerIntrinsics::runOnModule(Module &M) {
   return Changed;
 }

-ModulePass *llvm::createAMDGPULowerIntrinsicsPass(const TargetMachine *TM) {
-  return new AMDGPULowerIntrinsics(TM);
+ModulePass *llvm::createAMDGPULowerIntrinsicsPass() {
+  return new AMDGPULowerIntrinsics();
 }
diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index da247fea7de6..f1ef6281c90f 100644
--- a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -126,9 +126,15 @@ bool AMDGPUMCInstLower::lowerOperand(const MachineOperand &MO,
 }

 void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
+  unsigned Opcode = MI->getOpcode();

-  int MCOpcode = ST.getInstrInfo()->pseudoToMCOpcode(MI->getOpcode());
+  // FIXME: Should be able to handle this with emitPseudoExpansionLowering. We
+  // need to select it to the subtarget specific version, and there's no way to
+  // do that with a single pseudo source operation.
+  if (Opcode == AMDGPU::S_SETPC_B64_return)
+    Opcode = AMDGPU::S_SETPC_B64;
+
+  int MCOpcode = ST.getInstrInfo()->pseudoToMCOpcode(Opcode);
   if (MCOpcode == -1) {
     LLVMContext &C = MI->getParent()->getParent()->getFunction()->getContext();
     C.emitError("AMDGPUMCInstLower::lower - Pseudo instruction doesn't have "
diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index fe7283ccf7d9..9fb7f5f88927 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -12,21 +12,6 @@

 using namespace llvm;

-static bool isEntryFunctionCC(CallingConv::ID CC) {
-  switch (CC) {
-  case CallingConv::AMDGPU_KERNEL:
-  case CallingConv::SPIR_KERNEL:
-  case CallingConv::AMDGPU_VS:
-  case CallingConv::AMDGPU_HS:
-  case CallingConv::AMDGPU_GS:
-  case CallingConv::AMDGPU_PS:
-  case CallingConv::AMDGPU_CS:
-    return true;
-  default:
-    return false;
-  }
-}
-
 AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
   MachineFunctionInfo(),
   LocalMemoryObjects(),
@@ -34,7 +19,7 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
   MaxKernArgAlign(0),
   LDSSize(0),
   ABIArgOffset(0),
-  IsEntryFunction(isEntryFunctionCC(MF.getFunction()->getCallingConv())),
+  IsEntryFunction(AMDGPU::isEntryFunctionCC(MF.getFunction()->getCallingConv())),
   NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath) {
   // FIXME: Should initialize KernArgSize based on ExplicitKernelArgOffset,
   // except reserved size is not correctly aligned.
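The isEntryFunctionCC helper deleted above is not gone: the constructor now calls a shared AMDGPU::isEntryFunctionCC (presumably hosted with the other calling-convention utilities in AMDGPUBaseInfo). Its behavior, reconstructed from the deleted copy:

    // Kernel and graphics-shader calling conventions mark "entry functions"
    // (no callers within the generated code); C/Fast callees do not.
    static bool isEntryFunctionCC(llvm::CallingConv::ID CC) {
      switch (CC) {
      case llvm::CallingConv::AMDGPU_KERNEL:
      case llvm::CallingConv::SPIR_KERNEL:
      case llvm::CallingConv::AMDGPU_VS:
      case llvm::CallingConv::AMDGPU_HS:
      case llvm::CallingConv::AMDGPU_GS:
      case llvm::CallingConv::AMDGPU_PS:
      case llvm::CallingConv::AMDGPU_CS:
        return true;
      default:
        return false;
      }
    }
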
diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index e40f39557747..85184b363905 100644
--- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -23,6 +23,7 @@
 #include "llvm/ADT/Twine.h"
 #include "llvm/Analysis/CaptureTracking.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constant.h"
@@ -99,8 +100,7 @@ private:
 public:
   static char ID;

-  AMDGPUPromoteAlloca(const TargetMachine *TM_ = nullptr) :
-    FunctionPass(ID), TM(TM_) {}
+  AMDGPUPromoteAlloca() : FunctionPass(ID) {}

   bool doInitialization(Module &M) override;
   bool runOnFunction(Function &F) override;
@@ -119,30 +119,31 @@ public:

 char AMDGPUPromoteAlloca::ID = 0;

-INITIALIZE_TM_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
-                   "AMDGPU promote alloca to vector or LDS", false, false)
+INITIALIZE_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
+                "AMDGPU promote alloca to vector or LDS", false, false)

 char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;

 bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
-  if (!TM)
-    return false;
-
   Mod = &M;
   DL = &Mod->getDataLayout();

-  const Triple &TT = TM->getTargetTriple();
-
-  IsAMDGCN = TT.getArch() == Triple::amdgcn;
-  IsAMDHSA = TT.getOS() == Triple::AMDHSA;
-
   return false;
 }

 bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
-  if (!TM || skipFunction(F))
+  if (skipFunction(F))
     return false;

+  if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
+    TM = &TPC->getTM<TargetMachine>();
+  else
+    return false;
+
+  const Triple &TT = TM->getTargetTriple();
+  IsAMDGCN = TT.getArch() == Triple::amdgcn;
+  IsAMDHSA = TT.getOS() == Triple::AMDHSA;
+
   const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
   if (!ST.isPromoteAllocaEnabled())
     return false;
@@ -874,6 +875,6 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
   }
 }

-FunctionPass *llvm::createAMDGPUPromoteAlloca(const TargetMachine *TM) {
-  return new AMDGPUPromoteAlloca(TM);
+FunctionPass *llvm::createAMDGPUPromoteAlloca() {
+  return new AMDGPUPromoteAlloca();
 }
diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
index 941f2d8a468a..b2867fcc49f9 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
@@ -14,6 +14,7 @@

 #include "AMDGPURegisterInfo.h"
 #include "AMDGPUTargetMachine.h"
+#include "SIRegisterInfo.h"

 using namespace llvm;

@@ -24,18 +25,6 @@ AMDGPURegisterInfo::AMDGPURegisterInfo() : AMDGPUGenRegisterInfo(0) {}
 // they are not supported at this time.
 //===----------------------------------------------------------------------===//

-// Dummy to not crash RegisterClassInfo.
-static const MCPhysReg CalleeSavedReg = AMDGPU::NoRegister;
-
-const MCPhysReg *AMDGPURegisterInfo::getCalleeSavedRegs(
-  const MachineFunction *) const {
-  return &CalleeSavedReg;
-}
-
-unsigned AMDGPURegisterInfo::getFrameRegister(const MachineFunction &MF) const {
-  return AMDGPU::NoRegister;
-}
-
 unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const {
   static const unsigned SubRegs[] = {
     AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4,
@@ -50,3 +39,35 @@ unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const {

 #define GET_REGINFO_TARGET_DESC
 #include "AMDGPUGenRegisterInfo.inc"
+
+
+// Forced to be here by one .inc
+const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
+  const MachineFunction *MF) const {
+  CallingConv::ID CC = MF->getFunction()->getCallingConv();
+  switch (CC) {
+  case CallingConv::C:
+  case CallingConv::Fast:
+    return CSR_AMDGPU_HighRegs_SaveList;
+  default: {
+    // Dummy to not crash RegisterClassInfo.
+    static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
+    return &NoCalleeSavedReg;
+  }
+  }
+}
+
+const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
+                                                     CallingConv::ID CC) const {
+  switch (CC) {
+  case CallingConv::C:
+  case CallingConv::Fast:
+    return CSR_AMDGPU_HighRegs_RegMask;
+  default:
+    return nullptr;
+  }
+}
+
+unsigned SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  return AMDGPU::NoRegister;
+}
diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.h b/lib/Target/AMDGPU/AMDGPURegisterInfo.h
index 22b1663821d9..d8604d2590f1 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterInfo.h
+++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.h
@@ -30,9 +30,6 @@ struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo {
   /// \returns the sub reg enum value for the given \p Channel
   /// (e.g. getSubRegFromChannel(0) -> AMDGPU::sub0)
   unsigned getSubRegFromChannel(unsigned Channel) const;
-
-  const MCPhysReg* getCalleeSavedRegs(const MachineFunction *MF) const override;
-  unsigned getFrameRegister(const MachineFunction &MF) const override;
 };

 } // End namespace llvm
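CSR_AMDGPU_HighRegs_SaveList and CSR_AMDGPU_HighRegs_RegMask above are not hand-written: for each `def NAME : CalleeSavedRegs<...>` in AMDGPUCallingConv.td, the register-info tblgen backend emits a NAME_SaveList array (zero-terminated, which is why returning a pointer to a single NoRegister works as an empty list) and a NAME_RegMask bit mask. Consumer-side usage looks like this (sketch; markCalleeSaved is a hypothetical callback):

    const llvm::MCPhysReg *CSRs = TRI->getCalleeSavedRegs(&MF);
    for (unsigned I = 0; CSRs[I] != 0; ++I) // 0 terminates the save list
      markCalleeSaved(CSRs[I]); // e.g. record the register as callee-saved
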
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 386a88b0520f..a9d3a31a7240 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -570,7 +570,7 @@ void AMDGPUPassConfig::addIRPasses() {
   disablePass(&FuncletLayoutID);
   disablePass(&PatchableFunctionID);

-  addPass(createAMDGPULowerIntrinsicsPass(&TM));
+  addPass(createAMDGPULowerIntrinsicsPass());

   // Function calls are not supported, so make sure we inline everything.
   addPass(createAMDGPUAlwaysInlinePass());
@@ -585,8 +585,7 @@ void AMDGPUPassConfig::addIRPasses() {

   if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
     // TODO: May want to move later or split into an early and late one.
-    addPass(createAMDGPUCodeGenPreparePass(
-              static_cast<const GCNTargetMachine *>(&TM)));
+    addPass(createAMDGPUCodeGenPreparePass());
   }

   // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
@@ -594,7 +593,7 @@ void AMDGPUPassConfig::addIRPasses() {

   if (TM.getOptLevel() > CodeGenOpt::None) {
     addPass(createInferAddressSpacesPass());
-    addPass(createAMDGPUPromoteAlloca(&TM));
+    addPass(createAMDGPUPromoteAlloca());

     if (EnableSROA)
       addPass(createSROAPass());
@@ -664,22 +663,22 @@ bool R600PassConfig::addPreISel() {
 }

 void R600PassConfig::addPreRegAlloc() {
-  addPass(createR600VectorRegMerger(*TM));
+  addPass(createR600VectorRegMerger());
 }

 void R600PassConfig::addPreSched2() {
   addPass(createR600EmitClauseMarkers(), false);
   if (EnableR600IfConvert)
     addPass(&IfConverterID, false);
-  addPass(createR600ClauseMergePass(*TM), false);
+  addPass(createR600ClauseMergePass(), false);
 }

 void R600PassConfig::addPreEmitPass() {
   addPass(createAMDGPUCFGStructurizerPass(), false);
-  addPass(createR600ExpandSpecialInstrsPass(*TM), false);
+  addPass(createR600ExpandSpecialInstrsPass(), false);
   addPass(&FinalizeMachineBundlesID, false);
-  addPass(createR600Packetizer(*TM), false);
-  addPass(createR600ControlFlowFinalizer(*TM), false);
+  addPass(createR600Packetizer(), false);
+  addPass(createR600ControlFlowFinalizer(), false);
 }

 TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
@@ -703,8 +702,7 @@ bool GCNPassConfig::addPreISel() {

   // FIXME: We need to run a pass to propagate the attributes when calls are
   // supported.
-  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
-  addPass(createAMDGPUAnnotateKernelFeaturesPass(&TM));
+  addPass(createAMDGPUAnnotateKernelFeaturesPass());

   // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
   // regions formed by them.
diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 70c848f3c7bd..b52ea2b3a2c6 100644
--- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -2796,6 +2796,7 @@ void AMDGPUAsmParser::cvtDSImpl(MCInst &Inst, const OperandVector &Operands,

 void AMDGPUAsmParser::cvtExp(MCInst &Inst, const OperandVector &Operands) {
   OptionalImmIndexMap OptionalIdx;
+  unsigned OperandIdx[4];
   unsigned EnMask = 0;
   int SrcIdx = 0;

@@ -2804,15 +2805,18 @@ void AMDGPUAsmParser::cvtExp(MCInst &Inst, const OperandVector &Operands) {

     // Add the register arguments
     if (Op.isReg()) {
-      EnMask |= (1 << SrcIdx);
+      assert(SrcIdx < 4);
+      OperandIdx[SrcIdx] = Inst.size();
       Op.addRegOperands(Inst, 1);
       ++SrcIdx;
       continue;
     }

     if (Op.isOff()) {
-      ++SrcIdx;
+      assert(SrcIdx < 4);
+      OperandIdx[SrcIdx] = Inst.size();
       Inst.addOperand(MCOperand::createReg(AMDGPU::NoRegister));
+      ++SrcIdx;
       continue;
     }

@@ -2828,6 +2832,22 @@ void AMDGPUAsmParser::cvtExp(MCInst &Inst, const OperandVector &Operands) {
     OptionalIdx[Op.getImmTy()] = i;
   }

+  assert(SrcIdx == 4);
+
+  bool Compr = false;
+  if (OptionalIdx.find(AMDGPUOperand::ImmTyExpCompr) != OptionalIdx.end()) {
+    Compr = true;
+    Inst.getOperand(OperandIdx[1]) = Inst.getOperand(OperandIdx[2]);
+    Inst.getOperand(OperandIdx[2]).setReg(AMDGPU::NoRegister);
+    Inst.getOperand(OperandIdx[3]).setReg(AMDGPU::NoRegister);
+  }
+
+  for (auto i = 0; i < SrcIdx; ++i) {
+    if (Inst.getOperand(OperandIdx[i]).getReg() != AMDGPU::NoRegister) {
+      EnMask |= Compr? (0x3 << i * 2) : (0x1 << i);
+    }
+  }
+
   addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyExpVM);
   addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyExpCompr);

@@ -3642,6 +3662,7 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
   {"src0_sel", AMDGPUOperand::ImmTySdwaSrc0Sel, false, nullptr},
   {"src1_sel", AMDGPUOperand::ImmTySdwaSrc1Sel, false, nullptr},
   {"dst_unused", AMDGPUOperand::ImmTySdwaDstUnused, false, nullptr},
+  {"compr", AMDGPUOperand::ImmTyExpCompr, true, nullptr },
   {"vm", AMDGPUOperand::ImmTyExpVM, true, nullptr},
   {"op_sel", AMDGPUOperand::ImmTyOpSel, false, nullptr},
   {"op_sel_hi", AMDGPUOperand::ImmTyOpSelHi, false, nullptr},
diff --git a/lib/Target/AMDGPU/BUFInstructions.td b/lib/Target/AMDGPU/BUFInstructions.td
index 89eddb9ce961..2aca65ac8430 100644
--- a/lib/Target/AMDGPU/BUFInstructions.td
+++ b/lib/Target/AMDGPU/BUFInstructions.td
@@ -11,8 +11,8 @@ def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">;
 def MUBUFAddr64 : ComplexPattern<i64, 7, "SelectMUBUFAddr64">;
 def MUBUFAddr64Atomic : ComplexPattern<i64, 5, "SelectMUBUFAddr64">;

-def MUBUFScratchOffen : ComplexPattern<i64, 4, "SelectMUBUFScratchOffen">;
-def MUBUFScratchOffset : ComplexPattern<i64, 3, "SelectMUBUFScratchOffset", [], [], 20>;
+def MUBUFScratchOffen : ComplexPattern<i64, 4, "SelectMUBUFScratchOffen", [], [SDNPWantRoot]>;
+def MUBUFScratchOffset : ComplexPattern<i64, 3, "SelectMUBUFScratchOffset", [], [SDNPWantRoot], 20>;

 def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">;
 def MUBUFOffsetNoGLC : ComplexPattern<i64, 3, "SelectMUBUFOffset">;
diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 4fb03b62bba9..137b5cca96ce 100644
--- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -126,6 +126,7 @@ DecodeStatus AMDGPUDisassembler::tryDecodeInst(const uint8_t* Table,
   assert(MI.getOpcode() == 0);
   assert(MI.getNumOperands() == 0);
   MCInst TmpInst;
+  HasLiteral = false;
   const auto SavedBytes = Bytes;
   if (decodeInstruction(Table, TmpInst, Inst, Address, this, STI)) {
     MI = TmpInst;
@@ -343,10 +344,15 @@ MCOperand AMDGPUDisassembler::decodeLiteralConstant() const {
   // For now all literal constants are supposed to be unsigned integer
   // ToDo: deal with signed/unsigned 64-bit integer constants
   // ToDo: deal with float/double constants
-  if (Bytes.size() < 4)
-    return errOperand(0, "cannot read literal, inst bytes left " +
-                         Twine(Bytes.size()));
-  return MCOperand::createImm(eatBytes<uint32_t>(Bytes));
+  if (!HasLiteral) {
+    if (Bytes.size() < 4) {
+      return errOperand(0, "cannot read literal, inst bytes left " +
+                           Twine(Bytes.size()));
+    }
+    HasLiteral = true;
+    Literal = eatBytes<uint32_t>(Bytes);
+  }
+  return MCOperand::createImm(Literal);
 }

 MCOperand AMDGPUDisassembler::decodeIntImmed(unsigned Imm) {
diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index d50665187e10..620bae0a6d1a 100644
--- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -39,6 +39,8 @@ class Twine;
 class AMDGPUDisassembler : public MCDisassembler {
 private:
   mutable ArrayRef<uint8_t> Bytes;
+  mutable uint32_t Literal;
+  mutable bool HasLiteral;

 public:
   AMDGPUDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) :
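The HasLiteral/Literal pair added to the disassembler fixes a stream-consumption hazard: an instruction may reference its trailing 32-bit literal from more than one operand, but the bytes must be eaten from the stream exactly once. The caching shape, reduced to a self-contained sketch:

    #include <cstdint>
    #include <cstring>
    // Standalone sketch of the idea: read the literal once per instruction,
    // reuse the cached value for any later operand that refers to it.
    struct LiteralCache {
      bool HasLiteral = false; // reset per instruction, as in tryDecodeInst
      uint32_t Literal = 0;
      uint32_t get(const uint8_t *&Bytes) {
        if (!HasLiteral) {
          std::memcpy(&Literal, Bytes, 4); // eatBytes<uint32_t> equivalent
          Bytes += 4;                      // advance the stream only this once
          HasLiteral = true;
        }
        return Literal;
      }
    };
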
diff --git a/lib/Target/AMDGPU/GCNRegPressure.cpp b/lib/Target/AMDGPU/GCNRegPressure.cpp
index 8066428fe44a..18374dca3f84 100644
--- a/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//

 #include "GCNRegPressure.h"
+#include "llvm/CodeGen/RegisterPressure.h"

 using namespace llvm;

@@ -63,15 +64,6 @@ static bool isEqual(const GCNRPTracker::LiveRegSet &S1,
   return true;
 }

-static GCNRPTracker::LiveRegSet
-stripEmpty(const GCNRPTracker::LiveRegSet &LR) {
-  GCNRPTracker::LiveRegSet Res;
-  for (const auto &P : LR) {
-    if (P.second.any())
-      Res.insert(P);
-  }
-  return Res;
-}
 #endif

 ///////////////////////////////////////////////////////////////////////////////
@@ -185,6 +177,64 @@ void GCNRegPressure::print(raw_ostream &OS, const SISubtarget *ST) const {
 }
 #endif

+
+static LaneBitmask getDefRegMask(const MachineOperand &MO,
+                                 const MachineRegisterInfo &MRI) {
+  assert(MO.isDef() && MO.isReg() &&
+         TargetRegisterInfo::isVirtualRegister(MO.getReg()));
+
+  // We don't rely on read-undef flag because in case of tentative schedule
+  // tracking it isn't set correctly yet. This works correctly however since
+  // use mask has been tracked before using LIS.
+  return MO.getSubReg() == 0 ?
+    MRI.getMaxLaneMaskForVReg(MO.getReg()) :
+    MRI.getTargetRegisterInfo()->getSubRegIndexLaneMask(MO.getSubReg());
+}
+
+static LaneBitmask getUsedRegMask(const MachineOperand &MO,
+                                  const MachineRegisterInfo &MRI,
+                                  const LiveIntervals &LIS) {
+  assert(MO.isUse() && MO.isReg() &&
+         TargetRegisterInfo::isVirtualRegister(MO.getReg()));
+
+  if (auto SubReg = MO.getSubReg())
+    return MRI.getTargetRegisterInfo()->getSubRegIndexLaneMask(SubReg);
+
+  auto MaxMask = MRI.getMaxLaneMaskForVReg(MO.getReg());
+  if (MaxMask.getAsInteger() == 1) // cannot have subregs
+    return MaxMask;
+
+  // For a tentative schedule LIS isn't updated yet but livemask should remain
+  // the same on any schedule. Subreg defs can be reordered but they all must
+  // dominate uses anyway.
+  auto SI = LIS.getInstructionIndex(*MO.getParent()).getBaseIndex();
+  return getLiveLaneMask(MO.getReg(), SI, LIS, MRI);
+}
+
+SmallVector<RegisterMaskPair, 8> collectVirtualRegUses(const MachineInstr &MI,
+                                                       const LiveIntervals &LIS,
+                                                       const MachineRegisterInfo &MRI) {
+  SmallVector<RegisterMaskPair, 8> Res;
+  for (const auto &MO : MI.operands()) {
+    if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+      continue;
+    if (!MO.isUse() || !MO.readsReg())
+      continue;
+
+    auto const UsedMask = getUsedRegMask(MO, MRI, LIS);
+
+    auto Reg = MO.getReg();
+    auto I = std::find_if(Res.begin(), Res.end(), [Reg](const RegisterMaskPair &RM) {
+      return RM.RegUnit == Reg;
+    });
+    if (I != Res.end())
+      I->LaneMask |= UsedMask;
+    else
+      Res.push_back(RegisterMaskPair(Reg, UsedMask));
+  }
+  return Res;
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 // GCNRPTracker

@@ -222,36 +272,6 @@ GCNRPTracker::LiveRegSet llvm::getLiveRegs(SlotIndex SI,
   return LiveRegs;
 }

-LaneBitmask GCNRPTracker::getDefRegMask(const MachineOperand &MO) const {
-  assert(MO.isDef() && MO.isReg() &&
-         TargetRegisterInfo::isVirtualRegister(MO.getReg()));
-
-  // We don't rely on read-undef flag because in case of tentative schedule
-  // tracking it isn't set correctly yet. This works correctly however since
-  // use mask has been tracked before using LIS.
-  return MO.getSubReg() == 0 ?
-    MRI->getMaxLaneMaskForVReg(MO.getReg()) :
-    MRI->getTargetRegisterInfo()->getSubRegIndexLaneMask(MO.getSubReg());
-}
-
-LaneBitmask GCNRPTracker::getUsedRegMask(const MachineOperand &MO) const {
-  assert(MO.isUse() && MO.isReg() &&
-         TargetRegisterInfo::isVirtualRegister(MO.getReg()));
-
-  if (auto SubReg = MO.getSubReg())
-    return MRI->getTargetRegisterInfo()->getSubRegIndexLaneMask(SubReg);
-
-  auto MaxMask = MRI->getMaxLaneMaskForVReg(MO.getReg());
-  if (MaxMask.getAsInteger() == 1) // cannot have subregs
-    return MaxMask;
-
-  // For a tentative schedule LIS isn't updated yet but livemask should remain
-  // the same on any schedule. Subreg defs can be reordered but they all must
-  // dominate uses anyway.
-  auto SI = LIS.getInstructionIndex(*MO.getParent()).getBaseIndex();
-  return getLiveLaneMask(MO.getReg(), SI, LIS, *MRI);
-}
-
 void GCNUpwardRPTracker::reset(const MachineInstr &MI,
                                const LiveRegSet *LiveRegsCopy) {
   MRI = &MI.getParent()->getParent()->getRegInfo();
@@ -272,34 +292,40 @@ void GCNUpwardRPTracker::recede(const MachineInstr &MI) {
   if (MI.isDebugValue())
     return;

-  // process all defs first to ensure early clobbers are handled correctly
-  // iterating over operands() to catch implicit defs
-  for (const auto &MO : MI.operands()) {
-    if (!MO.isReg() || !MO.isDef() ||
-        !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
-      continue;
+  auto const RegUses = collectVirtualRegUses(MI, LIS, *MRI);

-    auto Reg = MO.getReg();
-    auto &LiveMask = LiveRegs[Reg];
-    auto PrevMask = LiveMask;
-    LiveMask &= ~getDefRegMask(MO);
-    CurPressure.inc(Reg, PrevMask, LiveMask, *MRI);
+  // calc pressure at the MI (defs + uses)
+  auto AtMIPressure = CurPressure;
+  for (const auto &U : RegUses) {
+    auto LiveMask = LiveRegs[U.RegUnit];
+    AtMIPressure.inc(U.RegUnit, LiveMask, LiveMask | U.LaneMask, *MRI);
   }
+  // update max pressure
+  MaxPressure = max(AtMIPressure, MaxPressure);

-  // then all uses
-  for (const auto &MO : MI.uses()) {
-    if (!MO.isReg() || !MO.readsReg() ||
-        !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+  for (const auto &MO : MI.defs()) {
+    if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()) ||
+        MO.isDead())
       continue;

     auto Reg = MO.getReg();
-    auto &LiveMask = LiveRegs[Reg];
+    auto I = LiveRegs.find(Reg);
+    if (I == LiveRegs.end())
+      continue;
+    auto &LiveMask = I->second;
     auto PrevMask = LiveMask;
-    LiveMask |= getUsedRegMask(MO);
+    LiveMask &= ~getDefRegMask(MO, *MRI);
     CurPressure.inc(Reg, PrevMask, LiveMask, *MRI);
+    if (LiveMask.none())
+      LiveRegs.erase(I);
   }
-
-  MaxPressure = max(MaxPressure, CurPressure);
+  for (const auto &U : RegUses) {
+    auto &LiveMask = LiveRegs[U.RegUnit];
+    auto PrevMask = LiveMask;
+    LiveMask |= U.LaneMask;
+    CurPressure.inc(U.RegUnit, PrevMask, LiveMask, *MRI);
+  }
+  assert(CurPressure == getRegPressure(*MRI, LiveRegs));
 }

 bool GCNDownwardRPTracker::reset(const MachineInstr &MI,
@@ -368,7 +394,7 @@ void GCNDownwardRPTracker::advanceToNext() {
       continue;
     auto &LiveMask = LiveRegs[Reg];
     auto PrevMask = LiveMask;
-    LiveMask |= getDefRegMask(MO);
+    LiveMask |= getDefRegMask(MO, *MRI);
     CurPressure.inc(Reg, PrevMask, LiveMask, *MRI);
   }

@@ -430,7 +456,7 @@ static void reportMismatch(const GCNRPTracker::LiveRegSet &LISLR,
 bool GCNUpwardRPTracker::isValid() const {
   const auto &SI = LIS.getInstructionIndex(*LastTrackedMI).getBaseIndex();
   const auto LISLR = llvm::getLiveRegs(SI, LIS, *MRI);
-  const auto TrackedLR = stripEmpty(LiveRegs);
+  const auto &TrackedLR = LiveRegs;

   if (!isEqual(LISLR, TrackedLR)) {
     dbgs() << "\nGCNUpwardRPTracker error: Tracked and"
diff --git a/lib/Target/AMDGPU/GCNRegPressure.h b/lib/Target/AMDGPU/GCNRegPressure.h
index 9875ca6a6d16..5dfe44053e72 100644
--- a/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/lib/Target/AMDGPU/GCNRegPressure.h
@@ -98,8 +98,6 @@ protected:
   const MachineInstr *LastTrackedMI = nullptr;
   mutable const MachineRegisterInfo *MRI = nullptr;
   GCNRPTracker(const LiveIntervals &LIS_) : LIS(LIS_) {}
-  LaneBitmask getDefRegMask(const MachineOperand &MO) const;
-  LaneBitmask getUsedRegMask(const MachineOperand &MO) const;
 public:
   // live regs for the current state
   const decltype(LiveRegs) &getLiveRegs() const { return LiveRegs; }
diff --git a/lib/Target/AMDGPU/R600ClauseMergePass.cpp b/lib/Target/AMDGPU/R600ClauseMergePass.cpp
index d0aba38f786d..fbe45cb222d9 100644
--- a/lib/Target/AMDGPU/R600ClauseMergePass.cpp
+++ b/lib/Target/AMDGPU/R600ClauseMergePass.cpp
@@ -62,7 +62,7 @@ private:
                           const MachineInstr &LatrCFAlu) const;

 public:
-  R600ClauseMergePass(TargetMachine &tm) : MachineFunctionPass(ID) { }
+  R600ClauseMergePass() : MachineFunctionPass(ID) { }

   bool runOnMachineFunction(MachineFunction &MF) override;

@@ -208,6 +208,6 @@ StringRef R600ClauseMergePass::getPassName() const {

 } // end anonymous namespace

-llvm::FunctionPass *llvm::createR600ClauseMergePass(TargetMachine &TM) {
-  return new R600ClauseMergePass(TM);
+llvm::FunctionPass *llvm::createR600ClauseMergePass() {
+  return new R600ClauseMergePass();
 }
diff --git a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
index 811b905588b4..09b328765604 100644
--- a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
@@ -499,7 +499,7 @@ private:
   }

 public:
-  R600ControlFlowFinalizer(TargetMachine &tm) : MachineFunctionPass(ID) {}
+  R600ControlFlowFinalizer() : MachineFunctionPass(ID) {}

   bool runOnMachineFunction(MachineFunction &MF) override {
     ST = &MF.getSubtarget<R600Subtarget>();
@@ -706,6 +706,6 @@ char R600ControlFlowFinalizer::ID = 0;

 } // end anonymous namespace

-FunctionPass *llvm::createR600ControlFlowFinalizer(TargetMachine &TM) {
-  return new R600ControlFlowFinalizer(TM);
+FunctionPass *llvm::createR600ControlFlowFinalizer() {
+  return new R600ControlFlowFinalizer();
 }
diff --git a/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
index 3e46e6387614..5c30a0734f0d 100644
--- a/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
+++ b/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
@@ -37,7 +37,7 @@ private:
                      unsigned Op);

 public:
-  R600ExpandSpecialInstrsPass(TargetMachine &tm) : MachineFunctionPass(ID),
+  R600ExpandSpecialInstrsPass() : MachineFunctionPass(ID),
     TII(nullptr) { }

   bool runOnMachineFunction(MachineFunction &MF) override;
@@ -51,8 +51,8 @@ public:

 char R600ExpandSpecialInstrsPass::ID = 0;

-FunctionPass *llvm::createR600ExpandSpecialInstrsPass(TargetMachine &TM) {
-  return new R600ExpandSpecialInstrsPass(TM);
+FunctionPass *llvm::createR600ExpandSpecialInstrsPass() {
+  return new R600ExpandSpecialInstrsPass();
 }

 void R600ExpandSpecialInstrsPass::SetFlagInNewMI(MachineInstr *NewMI,
MachineFunctionPass(ID), TII(nullptr) { } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -396,6 +396,6 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) { return false; } -llvm::FunctionPass *llvm::createR600VectorRegMerger(TargetMachine &tm) { - return new R600VectorRegMerger(tm); +llvm::FunctionPass *llvm::createR600VectorRegMerger() { + return new R600VectorRegMerger(); } diff --git a/lib/Target/AMDGPU/R600Packetizer.cpp b/lib/Target/AMDGPU/R600Packetizer.cpp index 5b6dd1ed128d..3e957126b497 100644 --- a/lib/Target/AMDGPU/R600Packetizer.cpp +++ b/lib/Target/AMDGPU/R600Packetizer.cpp @@ -36,7 +36,7 @@ class R600Packetizer : public MachineFunctionPass { public: static char ID; - R600Packetizer(const TargetMachine &TM) : MachineFunctionPass(ID) {} + R600Packetizer() : MachineFunctionPass(ID) {} void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); @@ -404,6 +404,6 @@ bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) { } // end anonymous namespace -llvm::FunctionPass *llvm::createR600Packetizer(TargetMachine &tm) { - return new R600Packetizer(tm); +llvm::FunctionPass *llvm::createR600Packetizer() { + return new R600Packetizer(); } diff --git a/lib/Target/AMDGPU/R600RegisterInfo.cpp b/lib/Target/AMDGPU/R600RegisterInfo.cpp index dfdc602b80cd..7501facb0cba 100644 --- a/lib/Target/AMDGPU/R600RegisterInfo.cpp +++ b/lib/Target/AMDGPU/R600RegisterInfo.cpp @@ -56,6 +56,18 @@ BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const { return Reserved; } +// Dummy to not crash RegisterClassInfo. +static const MCPhysReg CalleeSavedReg = AMDGPU::NoRegister; + +const MCPhysReg *R600RegisterInfo::getCalleeSavedRegs( + const MachineFunction *) const { + return &CalleeSavedReg; +} + +unsigned R600RegisterInfo::getFrameRegister(const MachineFunction &MF) const { + return AMDGPU::NoRegister; +} + unsigned R600RegisterInfo::getHWRegChan(unsigned reg) const { return this->getEncodingValue(reg) >> HW_CHAN_SHIFT; } diff --git a/lib/Target/AMDGPU/R600RegisterInfo.h b/lib/Target/AMDGPU/R600RegisterInfo.h index 9dfb3106c6cc..f0d9644b02f2 100644 --- a/lib/Target/AMDGPU/R600RegisterInfo.h +++ b/lib/Target/AMDGPU/R600RegisterInfo.h @@ -27,6 +27,8 @@ struct R600RegisterInfo final : public AMDGPURegisterInfo { R600RegisterInfo(); BitVector getReservedRegs(const MachineFunction &MF) const override; + const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; + unsigned getFrameRegister(const MachineFunction &MF) const override; /// \brief get the HW encoding for a register's channel. 
unsigned getHWRegChan(unsigned reg) const; diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp index 1279f845de0e..97bb0f0c0656 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -189,8 +189,6 @@ SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg( // ---- // 13 (+1) unsigned ReservedRegCount = 13; - if (SPReg != AMDGPU::NoRegister) - ++ReservedRegCount; if (AllSGPRs.size() < ReservedRegCount) return std::make_pair(ScratchWaveOffsetReg, SPReg); @@ -208,13 +206,6 @@ SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg( MRI.replaceRegWith(ScratchWaveOffsetReg, Reg); MFI->setScratchWaveOffsetReg(Reg); ScratchWaveOffsetReg = Reg; - } else { - if (SPReg == AMDGPU::NoRegister) - break; - - MRI.replaceRegWith(SPReg, Reg); - MFI->setStackPtrOffsetReg(Reg); - SPReg = Reg; break; } } @@ -223,8 +214,8 @@ SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg( return std::make_pair(ScratchWaveOffsetReg, SPReg); } -void SIFrameLowering::emitPrologue(MachineFunction &MF, - MachineBasicBlock &MBB) const { +void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const { // Emit debugger prologue if "amdgpu-debugger-emit-prologue" attribute was // specified. const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); @@ -424,6 +415,13 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, } } +void SIFrameLowering::emitPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + if (MFI->isEntryFunction()) + emitEntryFunctionPrologue(MF, MBB); +} + void SIFrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { diff --git a/lib/Target/AMDGPU/SIFrameLowering.h b/lib/Target/AMDGPU/SIFrameLowering.h index 7ccd02b3c86a..e17adbe27361 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.h +++ b/lib/Target/AMDGPU/SIFrameLowering.h @@ -26,6 +26,8 @@ public: AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {} ~SIFrameLowering() override = default; + void emitEntryFunctionPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const; void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void emitEpilogue(MachineFunction &MF, diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 286be355bc14..01c1f78e7ca4 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -914,6 +914,55 @@ SDValue SITargetLowering::lowerKernargMemParameter( return DAG.getMergeValues({ Val, Load.getValue(1) }, SL); } +SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA, + const SDLoc &SL, SDValue Chain, + const ISD::InputArg &Arg) const { + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo &MFI = MF.getFrameInfo(); + + if (Arg.Flags.isByVal()) { + unsigned Size = Arg.Flags.getByValSize(); + int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false); + return DAG.getFrameIndex(FrameIdx, MVT::i32); + } + + unsigned ArgOffset = VA.getLocMemOffset(); + unsigned ArgSize = VA.getValVT().getStoreSize(); + + int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true); + + // Create load nodes to retrieve arguments from the stack. 
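Stepping back to the SIFrameLowering hunk above for a moment: emitPrologue is now a thin dispatcher, so only entry functions (kernels and shaders) get the old prologue, and callable functions get nothing yet. A minimal standalone sketch of that shape, with mock types standing in for the real MachineFunction/SIMachineFunctionInfo API:

#include <iostream>

// Mock stand-ins for the real MachineFunction/SIMachineFunctionInfo types.
struct FunctionInfo {
  bool EntryFunction;
  bool isEntryFunction() const { return EntryFunction; }
};

struct FrameLowering {
  void emitEntryFunctionPrologue(const FunctionInfo &) const {
    std::cout << "set up scratch rsrc and wave offset for an entry function\n";
  }
  // Callable functions intentionally fall through with no prologue, matching
  // the patch, which only handles the entry-function case so far.
  void emitPrologue(const FunctionInfo &FI) const {
    if (FI.isEntryFunction())
      emitEntryFunctionPrologue(FI);
  }
};

int main() {
  FrameLowering FL;
  FL.emitPrologue({true});  // kernel: prologue emitted
  FL.emitPrologue({false}); // callable function: nothing emitted
}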
+ SDValue FIN = DAG.getFrameIndex(FI, MVT::i32); + SDValue ArgValue; + + // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT) + ISD::LoadExtType ExtType = ISD::NON_EXTLOAD; + MVT MemVT = VA.getValVT(); + + switch (VA.getLocInfo()) { + default: + break; + case CCValAssign::BCvt: + MemVT = VA.getLocVT(); + break; + case CCValAssign::SExt: + ExtType = ISD::SEXTLOAD; + break; + case CCValAssign::ZExt: + ExtType = ISD::ZEXTLOAD; + break; + case CCValAssign::AExt: + ExtType = ISD::EXTLOAD; + break; + } + + ArgValue = DAG.getExtLoad( + ExtType, SL, VA.getLocVT(), Chain, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), + MemVT); + return ArgValue; +} + static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits, CallingConv::ID CallConv, ArrayRef<ISD::InputArg> Ins, @@ -1094,10 +1143,12 @@ static void allocateSystemSGPRs(CCState &CCInfo, static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, - SIMachineFunctionInfo &Info) { + SIMachineFunctionInfo &Info, + bool NeedSP) { // Now that we've figured out where the scratch register inputs are, see if // should reserve the arguments and use them directly. - bool HasStackObjects = MF.getFrameInfo().hasStackObjects(); + MachineFrameInfo &MFI = MF.getFrameInfo(); + bool HasStackObjects = MFI.hasStackObjects(); // Record that we know we have non-spill stack objects so we don't need to // check all stack objects later. @@ -1155,6 +1206,15 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM, Info.setScratchWaveOffsetReg(ReservedOffsetReg); } } + + if (NeedSP){ + unsigned ReservedStackPtrOffsetReg = TRI.reservedStackPtrOffsetReg(MF); + Info.setStackPtrOffsetReg(ReservedStackPtrOffsetReg); + + assert(Info.getStackPtrOffsetReg() != Info.getFrameOffsetReg()); + assert(!TRI.isSubRegister(Info.getScratchRSrcReg(), + Info.getStackPtrOffsetReg())); + } } SDValue SITargetLowering::LowerFormalArguments( @@ -1223,8 +1283,10 @@ SDValue SITargetLowering::LowerFormalArguments( !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() && !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ()); + } else if (IsKernel) { + assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX()); } else { - assert(!IsKernel || (Info->hasWorkGroupIDX() && Info->hasWorkItemIDX())); + Splits.append(Ins.begin(), Ins.end()); } if (IsEntryFunc) { @@ -1278,11 +1340,14 @@ SDValue SITargetLowering::LowerFormalArguments( InVals.push_back(Arg); continue; + } else if (!IsEntryFunc && VA.isMemLoc()) { + SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg); + InVals.push_back(Val); + if (!Arg.Flags.isByVal()) + Chains.push_back(Val.getValue(1)); + continue; } - if (VA.isMemLoc()) - report_fatal_error("memloc not supported with calling convention"); - assert(VA.isRegLoc() && "Parameter must be in a register!"); unsigned Reg = VA.getLocReg(); @@ -1291,7 +1356,7 @@ SDValue SITargetLowering::LowerFormalArguments( Reg = MF.addLiveIn(Reg, RC); SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT); - if (Arg.VT.isVector()) { + if (IsShader && Arg.VT.isVector()) { // Build a vector from the registers Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); unsigned NumElements = ParamType->getVectorNumElements(); @@ -1317,16 +1382,49 @@ SDValue SITargetLowering::LowerFormalArguments( InVals.push_back(Val); } + const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); + + // TODO: Could maybe omit SP if only tail calls? 
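Returning to lowerStackParameter above: it decides how an incoming stack argument is loaded from its fixed frame object based on how the calling convention assigned it. The mapping is small enough to state on its own; this sketch imitates it with simplified enums (LocInfo and LoadExtType model the real CCValAssign::LocInfo and ISD::LoadExtType; the real code also switches the memory VT for BCvt):

#include <cassert>

enum class LocInfo { Full, BCvt, SExt, ZExt, AExt };
enum class LoadExtType { NonExt, SExtLoad, ZExtLoad, ExtLoad };

LoadExtType extTypeForLoc(LocInfo LI) {
  switch (LI) {
  case LocInfo::SExt: return LoadExtType::SExtLoad; // sign-extend to the loc size
  case LocInfo::ZExt: return LoadExtType::ZExtLoad; // zero-extend to the loc size
  case LocInfo::AExt: return LoadExtType::ExtLoad;  // any-extend, upper bits undefined
  default:            return LoadExtType::NonExt;   // Full/BCvt: plain load
  }
}

int main() {
  assert(extTypeForLoc(LocInfo::ZExt) == LoadExtType::ZExtLoad);
  assert(extTypeForLoc(LocInfo::BCvt) == LoadExtType::NonExt);
}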
+ bool NeedSP = FrameInfo.hasCalls() || FrameInfo.hasVarSizedObjects(); + // Start adding system SGPRs. - if (IsEntryFunc) + if (IsEntryFunc) { allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader); - - reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info); + reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info, NeedSP); + } else { + CCInfo.AllocateReg(Info->getScratchRSrcReg()); + CCInfo.AllocateReg(Info->getScratchWaveOffsetReg()); + CCInfo.AllocateReg(Info->getFrameOffsetReg()); + + if (NeedSP) { + unsigned StackPtrReg = findFirstFreeSGPR(CCInfo); + CCInfo.AllocateReg(StackPtrReg); + Info->setStackPtrOffsetReg(StackPtrReg); + } + } return Chains.empty() ? Chain : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); } +// TODO: If return values can't fit in registers, we should return as many as +// possible in registers before passing on stack. +bool SITargetLowering::CanLowerReturn( + CallingConv::ID CallConv, + MachineFunction &MF, bool IsVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + LLVMContext &Context) const { + // Replacing returns with sret/stack usage doesn't make sense for shaders. + // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn + // for shaders. Vector types should be explicitly handled by CC. + if (AMDGPU::isEntryFunctionCC(CallConv)) + return true; + + SmallVector<CCValAssign, 16> RVLocs; + CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context); + return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)); +} + SDValue SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, @@ -1336,11 +1434,15 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, MachineFunction &MF = DAG.getMachineFunction(); SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); - if (!AMDGPU::isShader(CallConv)) + if (AMDGPU::isKernel(CallConv)) { return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs, OutVals, DL, DAG); + } + + bool IsShader = AMDGPU::isShader(CallConv); Info->setIfReturnsVoid(Outs.size() == 0); + bool IsWaveEnd = Info->returnsVoid() && IsShader; SmallVector<ISD::OutputArg, 48> Splits; SmallVector<SDValue, 48> SplitVals; @@ -1349,7 +1451,7 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, for (unsigned i = 0, e = Outs.size(); i != e; ++i) { const ISD::OutputArg &Out = Outs[i]; - if (Out.VT.isVector()) { + if (IsShader && Out.VT.isVector()) { MVT VT = Out.VT.getVectorElementType(); ISD::OutputArg NewOut = Out; NewOut.Flags.setSplit(); @@ -1380,29 +1482,58 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, *DAG.getContext()); // Analyze outgoing return values. - AnalyzeReturn(CCInfo, Splits); + CCInfo.AnalyzeReturn(Splits, CCAssignFnForReturn(CallConv, isVarArg)); SDValue Flag; SmallVector<SDValue, 48> RetOps; RetOps.push_back(Chain); // Operand #0 = Chain (updated below) + // Add return address for callable functions. + if (!Info->isEntryFunction()) { + const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); + SDValue ReturnAddrReg = CreateLiveInRegister( + DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64); + + // FIXME: Should be able to use a vreg here, but need a way to prevent it + // from being allocated to a CSR.
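A quick note on the CanLowerReturn override introduced above before the rest of this hunk: it is what lets SelectionDAG demote oversized return values to sret for callable functions while always saying yes for entry points. A toy restatement, where FitsInReturnRegs is an invented stand-in for what CCState::CheckReturn computes via CCAssignFnForReturn:

#include <cassert>

bool canLowerReturn(bool IsEntryFunctionCC, bool FitsInReturnRegs) {
  if (IsEntryFunctionCC)
    return true;            // sret/stack return makes no sense for kernels/shaders
  return FitsInReturnRegs;  // callable functions defer to the CC check
}

int main() {
  assert(canLowerReturn(true, false));   // entry point: always lowerable
  assert(!canLowerReturn(false, false)); // oversized return: demoted to sret
}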
+ + SDValue PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF), + MVT::i64); + + Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, Flag); + Flag = Chain.getValue(1); + + RetOps.push_back(PhysReturnAddrReg); + } + // Copy the result values into the output registers. for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size(); ++i, ++realRVLocIdx) { CCValAssign &VA = RVLocs[i]; assert(VA.isRegLoc() && "Can only return in registers!"); + // TODO: Partially return in registers if return values don't fit. SDValue Arg = SplitVals[realRVLocIdx]; // Copied from other backends. switch (VA.getLocInfo()) { - default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::BCvt: Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); break; + case CCValAssign::SExt: + Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg); + break; + case CCValAssign::ZExt: + Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); + break; + case CCValAssign::AExt: + Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); + break; + default: + llvm_unreachable("Unknown loc info!"); } Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag); @@ -1410,12 +1541,16 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); } + // FIXME: Does sret work properly? + // Update chain and glue. RetOps[0] = Chain; if (Flag.getNode()) RetOps.push_back(Flag); - unsigned Opc = Info->returnsVoid() ? AMDGPUISD::ENDPGM : AMDGPUISD::RETURN_TO_EPILOG; + unsigned Opc = AMDGPUISD::ENDPGM; + if (!IsWaveEnd) + Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG; return DAG.getNode(Opc, DL, MVT::Other, RetOps); } @@ -2660,6 +2795,15 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, SDValue Vec = Op.getOperand(0); SDValue Idx = Op.getOperand(1); + DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr); + + // Make sure we do any optimizations that will make it easier to fold + // source modifiers before obscuring it with bit operations. + + // XXX - Why doesn't this get called when vector_shuffle is expanded?
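Back in LowerReturn above, the terminator selection reduces to a small decision table: a shader that returns void ends the wave, a shader with outputs returns to the epilog, and a callable function takes a normal return. A self-contained distillation, where the strings stand in for the AMDGPUISD opcodes and the inputs match what the real code computes:

#include <cassert>
#include <string>

std::string pickReturnOpcode(bool IsShader, bool ReturnsVoid) {
  bool IsWaveEnd = IsShader && ReturnsVoid; // shaders with no outputs end the wave
  if (IsWaveEnd)
    return "ENDPGM";
  return IsShader ? "RETURN_TO_EPILOG" : "RET_FLAG";
}

int main() {
  assert(pickReturnOpcode(true, true) == "ENDPGM");            // void shader
  assert(pickReturnOpcode(true, false) == "RETURN_TO_EPILOG"); // shader with exports
  assert(pickReturnOpcode(false, true) == "RET_FLAG");         // callable function
}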
+ if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI)) + return Combined; + if (const ConstantSDNode *CIdx = dyn_cast<ConstantSDNode>(Idx)) { SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec); diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h index 046e677756d1..e68837747491 100644 --- a/lib/Target/AMDGPU/SIISelLowering.h +++ b/lib/Target/AMDGPU/SIISelLowering.h @@ -28,6 +28,10 @@ class SITargetLowering final : public AMDGPUTargetLowering { uint64_t Offset, bool Signed, const ISD::InputArg *Arg = nullptr) const; + SDValue lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA, + const SDLoc &SL, SDValue Chain, + const ISD::InputArg &Arg) const; + SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const override; SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op, @@ -177,7 +181,12 @@ public: const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const override; - SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + bool CanLowerReturn(CallingConv::ID CallConv, + MachineFunction &MF, bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + LLVMContext &Context) const override; + + SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override; diff --git a/lib/Target/AMDGPU/SIInstrFormats.td b/lib/Target/AMDGPU/SIInstrFormats.td index b83a1fe187eb..02c9b4b1f0ee 100644 --- a/lib/Target/AMDGPU/SIInstrFormats.td +++ b/lib/Target/AMDGPU/SIInstrFormats.td @@ -228,10 +228,10 @@ class EXPe : Enc64 { bits<1> compr; bits<1> done; bits<1> vm; - bits<8> vsrc0; - bits<8> vsrc1; - bits<8> vsrc2; - bits<8> vsrc3; + bits<8> src0; + bits<8> src1; + bits<8> src2; + bits<8> src3; let Inst{3-0} = en; let Inst{9-4} = tgt; @@ -239,10 +239,10 @@ class EXPe : Enc64 { let Inst{11} = done; let Inst{12} = vm; let Inst{31-26} = 0x3e; - let Inst{39-32} = vsrc0; - let Inst{47-40} = vsrc1; - let Inst{55-48} = vsrc2; - let Inst{63-56} = vsrc3; + let Inst{39-32} = src0; + let Inst{47-40} = src1; + let Inst{55-48} = src2; + let Inst{63-56} = src3; } let Uses = [EXEC] in { diff --git a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index 933a16646746..c6ad61a325cc 100644 --- a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -97,9 +97,7 @@ private: public: static char ID; - SILoadStoreOptimizer() : MachineFunctionPass(ID) {} - - SILoadStoreOptimizer(const TargetMachine &TM_) : MachineFunctionPass(ID) { + SILoadStoreOptimizer() : MachineFunctionPass(ID) { initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); } @@ -129,8 +127,8 @@ char SILoadStoreOptimizer::ID = 0; char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID; -FunctionPass *llvm::createSILoadStoreOptimizerPass(TargetMachine &TM) { - return new SILoadStoreOptimizer(TM); +FunctionPass *llvm::createSILoadStoreOptimizerPass() { + return new SILoadStoreOptimizer(); } static void moveInstsAfter(MachineBasicBlock::iterator I, diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index adebb8c4a1c5..18b197ddb7ae 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -80,17 +80,22 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) 
FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(*F); WavesPerEU = ST.getWavesPerEU(*F); - // Non-entry functions have no special inputs for now. - // TODO: Return early for non-entry CCs. + if (!isEntryFunction()) { + // Non-entry functions have no special inputs for now, other registers + // required for scratch access. + ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3; + ScratchWaveOffsetReg = AMDGPU::SGPR4; + FrameOffsetReg = AMDGPU::SGPR5; + return; + } CallingConv::ID CC = F->getCallingConv(); - if (CC == CallingConv::AMDGPU_PS) - PSInputAddr = AMDGPU::getInitialPSInputAddr(*F); - - if (AMDGPU::isKernel(CC)) { + if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) { KernargSegmentPtr = true; WorkGroupIDX = true; WorkItemIDX = true; + } else if (CC == CallingConv::AMDGPU_PS) { + PSInputAddr = AMDGPU::getInitialPSInputAddr(*F); } if (ST.debuggerEmitPrologue()) { @@ -120,7 +125,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); bool MaySpill = ST.isVGPRSpillingEnabled(*F); - bool HasStackObjects = FrameInfo.hasStackObjects(); + bool HasStackObjects = FrameInfo.hasStackObjects() || FrameInfo.hasCalls(); if (HasStackObjects || MaySpill) { PrivateSegmentWaveByteOffset = true; diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h index dc9f509e60ae..348bb4fa0260 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -388,9 +388,8 @@ public: void setScratchWaveOffsetReg(unsigned Reg) { assert(Reg != AMDGPU::NoRegister && "Should never be unset"); ScratchWaveOffsetReg = Reg; - - // FIXME: Only for entry functions. - FrameOffsetReg = ScratchWaveOffsetReg; + if (isEntryFunction()) + FrameOffsetReg = ScratchWaveOffsetReg; } unsigned getQueuePtrUserSGPR() const { diff --git a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index e02c2e3240e8..4dc090d9b7ed 100644 --- a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -30,6 +30,7 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include <unordered_map> +#include <unordered_set> using namespace llvm; @@ -44,26 +45,29 @@ namespace { class SDWAOperand; class SIPeepholeSDWA : public MachineFunctionPass { +public: + typedef SmallVector<SDWAOperand *, 4> SDWAOperandsVector; + private: MachineRegisterInfo *MRI; const SIRegisterInfo *TRI; const SIInstrInfo *TII; std::unordered_map<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands; + std::unordered_map<MachineInstr *, SDWAOperandsVector> PotentialMatches; Optional<int64_t> foldToImm(const MachineOperand &Op) const; public: static char ID; - typedef SmallVector<std::unique_ptr<SDWAOperand>, 4> SDWAOperandsVector; - SIPeepholeSDWA() : MachineFunctionPass(ID) { initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry()); } bool runOnMachineFunction(MachineFunction &MF) override; void matchSDWAOperands(MachineFunction &MF); + bool isConvertibleToSDWA(const MachineInstr &MI) const; bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands); StringRef getPassName() const override { return "SI Peephole SDWA"; } @@ -468,7 +472,7 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) { if (Opcode == AMDGPU::V_LSHLREV_B16_e32) { auto SDWADst = - make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD); + make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD); DEBUG(dbgs() << 
"Match: " << MI << "To: " << *SDWADst << '\n'); SDWAOperands[&MI] = std::move(SDWADst); ++NumSDWAPatternsFound; @@ -575,8 +579,7 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) { } } -bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, - const SDWAOperandsVector &SDWAOperands) { +bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI) const { // Check if this instruction can be converted to SDWA: // 1. Does this opcode support SDWA if (AMDGPU::getSDWAOp(MI.getOpcode()) == -1) @@ -588,6 +591,11 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, return false; } + return true; +} + +bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, + const SDWAOperandsVector &SDWAOperands) { // Convert to sdwa int SDWAOpcode = AMDGPU::getSDWAOp(MI.getOpcode()); assert(SDWAOpcode != -1); @@ -664,7 +672,18 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, // Apply all sdwa operand pattenrs bool Converted = false; for (auto &Operand : SDWAOperands) { - Converted |= Operand->convertToSDWA(*SDWAInst, TII); + // There should be no intesection between SDWA operands and potential MIs + // e.g.: + // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0 + // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0 + // v_add_u32 v3, v4, v2 + // + // In that example it is possible that we would fold 2nd instruction into 3rd + // (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd (that was + // already destroyed). So if SDWAOperand is also a potential MI then do not + // apply it. + if (PotentialMatches.count(Operand->getParentInst()) == 0) + Converted |= Operand->convertToSDWA(*SDWAInst, TII); } if (!Converted) { SDWAInst->eraseFromParent(); @@ -690,16 +709,15 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) { MRI = &MF.getRegInfo(); TRI = ST.getRegisterInfo(); TII = ST.getInstrInfo(); - - std::unordered_map<MachineInstr *, SDWAOperandsVector> PotentialMatches; - + + // Find all SDWA operands in MF. matchSDWAOperands(MF); - for (auto &OperandPair : SDWAOperands) { - auto &Operand = OperandPair.second; + for (const auto &OperandPair : SDWAOperands) { + const auto &Operand = OperandPair.second; MachineInstr *PotentialMI = Operand->potentialToConvert(TII); - if (PotentialMI) { - PotentialMatches[PotentialMI].push_back(std::move(Operand)); + if (PotentialMI && isConvertibleToSDWA(*PotentialMI)) { + PotentialMatches[PotentialMI].push_back(Operand.get()); } } @@ -708,6 +726,7 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) { convertToSDWA(PotentialMI, PotentialPair.second); } + PotentialMatches.clear(); SDWAOperands.clear(); return false; } diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp index 06cfc95be96a..6fb01a09fe13 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -117,11 +117,7 @@ unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg( return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass); } -unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg( - const MachineFunction &MF) const { - - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); - unsigned RegCount = ST.getMaxNumSGPRs(MF); +static unsigned findPrivateSegmentWaveByteOffsetRegIndex(unsigned RegCount) { unsigned Reg; // Try to place it in a hole after PrivateSegmentBufferReg. @@ -134,9 +130,22 @@ unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg( // wave offset before it. 
Reg = RegCount - 5; } + + return Reg; +} + +unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg( + const MachineFunction &MF) const { + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + unsigned Reg = findPrivateSegmentWaveByteOffsetRegIndex(ST.getMaxNumSGPRs(MF)); return AMDGPU::SGPR_32RegClass.getRegister(Reg); } +unsigned SIRegisterInfo::reservedStackPtrOffsetReg( + const MachineFunction &MF) const { + return AMDGPU::SGPR32; +} + BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); Reserved.set(AMDGPU::INDIRECT_BASE_ADDR); @@ -198,15 +207,33 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg)); } + unsigned StackPtrReg = MFI->getStackPtrOffsetReg(); + if (StackPtrReg != AMDGPU::NoRegister) { + reserveRegisterTuples(Reserved, StackPtrReg); + assert(!isSubRegister(ScratchRSrcReg, StackPtrReg)); + } + + unsigned FrameReg = MFI->getFrameOffsetReg(); + if (FrameReg != AMDGPU::NoRegister) { + reserveRegisterTuples(Reserved, FrameReg); + assert(!isSubRegister(ScratchRSrcReg, FrameReg)); + } + return Reserved; } bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const { - return Fn.getFrameInfo().hasStackObjects(); + const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>(); + if (Info->isEntryFunction()) { + const MachineFrameInfo &MFI = Fn.getFrameInfo(); + return MFI.hasStackObjects() || MFI.hasCalls(); + } + + // May need scavenger for dealing with callee saved registers. + return true; } -bool -SIRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) const { +bool SIRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) const { return MF.getFrameInfo().hasStackObjects(); } @@ -318,8 +345,11 @@ void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, MachineOperand *FIOp = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr); assert(FIOp && FIOp->isFI() && "frame index must be address operand"); - assert(TII->isMUBUF(MI)); + assert(TII->getNamedOperand(MI, AMDGPU::OpName::soffset)->getReg() == + MF->getInfo<SIMachineFunctionInfo>()->getFrameOffsetReg() && + "should only be seeing frame offset relative FrameIndex"); + MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset); int64_t NewOffset = OffsetOp->getImm() + Offset; @@ -981,12 +1011,72 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, } default: { - if (TII->isMUBUF(*MI)) { + const DebugLoc &DL = MI->getDebugLoc(); + bool IsMUBUF = TII->isMUBUF(*MI); + + if (!IsMUBUF && + MFI->getFrameOffsetReg() != MFI->getScratchWaveOffsetReg()) { + // Convert to an absolute stack address by finding the offset from the + // scratch wave base and scaling by the wave size. + // + // In an entry function/kernel the stack address is already the absolute + // address relative to the the scratch wave offset. + + unsigned DiffReg + = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + + bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32; + unsigned ResultReg = IsCopy ? + MI->getOperand(0).getReg() : + MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), DiffReg) + .addReg(MFI->getFrameOffsetReg()) + .addReg(MFI->getScratchWaveOffsetReg()); + + int64_t Offset = FrameInfo.getObjectOffset(Index); + if (Offset == 0) { + // XXX - This never happens because of emergency scavenging slot at 0? 
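Before the rest of this eliminateFrameIndex hunk, the address arithmetic it emits is worth stating in plain form: for a callable function, subtract the scratch wave offset from the frame offset register, scale down by the wave size, then add the object's frame offset. A scalar model, where the arguments are the values those SGPRs hold and the real code emits S_SUB_U32, S_LSHR_B32 and V_ADD_I32 instead of C operators:

#include <cassert>
#include <cstdint>

uint32_t absoluteStackOffset(uint32_t FrameOffset, uint32_t ScratchWaveOffset,
                             uint32_t WavefrontSize, uint32_t ObjectOffset) {
  uint32_t Diff = FrameOffset - ScratchWaveOffset; // S_SUB_U32
  uint32_t Scaled = Diff / WavefrontSize;          // S_LSHR_B32 by log2(wave size)
  return Scaled + ObjectOffset;                    // V_ADD_I32 with the FI offset
}

int main() {
  // 256 wave-relative bytes over a 64-lane wave is 4 bytes per lane, plus the
  // object's own offset of 16.
  assert(absoluteStackOffset(1024, 768, 64, 16) == 20);
}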
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg) + .addImm(Log2_32(ST.getWavefrontSize())) + .addReg(DiffReg); + } else { + unsigned CarryOut + = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + unsigned ScaledReg + = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + + // XXX - Should this use a vector shift? + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg) + .addReg(DiffReg, RegState::Kill) + .addImm(Log2_32(ST.getWavefrontSize())); + + // TODO: Fold if use instruction is another add of a constant. + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_I32_e64), ResultReg) + .addReg(CarryOut, RegState::Define | RegState::Dead) + .addImm(Offset) + .addReg(ScaledReg, RegState::Kill); + + MRI.setRegAllocationHint(CarryOut, 0, AMDGPU::VCC); + } + + // Don't introduce an extra copy if we're just materializing in a mov. + if (IsCopy) + MI->eraseFromParent(); + else + FIOp.ChangeToRegister(ResultReg, false, false, true); + return; + } + + if (IsMUBUF) { // Disable offen so we don't need a 0 vgpr base. assert(static_cast<int>(FIOperandNum) == AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr)); + assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() + == MFI->getFrameOffsetReg()); + int64_t Offset = FrameInfo.getObjectOffset(Index); int64_t OldImm = TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(); @@ -995,17 +1085,19 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, if (isUInt<12>(NewOffset) && buildMUBUFOffsetLoadStore(TII, FrameInfo, MI, Index, NewOffset)) { MI->eraseFromParent(); - break; + return; } } + // If the offset is simply too big, don't convert to a scratch wave offset + // relative index. + int64_t Offset = FrameInfo.getObjectOffset(Index); FIOp.ChangeToImmediate(Offset); if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) { unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - BuildMI(*MBB, MI, MI->getDebugLoc(), - TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) - .addImm(Offset); + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) + .addImm(Offset); FIOp.ChangeToRegister(TmpReg, false, false, true); } } diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h index 679ed229758a..b91cdddc5520 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/lib/Target/AMDGPU/SIRegisterInfo.h @@ -17,6 +17,7 @@ #include "AMDGPURegisterInfo.h" #include "SIDefines.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/CodeGen/MachineRegisterInfo.h" namespace llvm { @@ -57,8 +58,16 @@ public: unsigned reservedPrivateSegmentWaveByteOffsetReg( const MachineFunction &MF) const; + unsigned reservedStackPtrOffsetReg(const MachineFunction &MF) const; + BitVector getReservedRegs(const MachineFunction &MF) const override; + const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; + const uint32_t *getCallPreservedMask(const MachineFunction &MF, + CallingConv::ID) const override; + + unsigned getFrameRegister(const MachineFunction &MF) const override; + bool requiresRegisterScavenging(const MachineFunction &Fn) const override; bool requiresFrameIndexScavenging(const MachineFunction &MF) const override; @@ -228,6 +237,11 @@ public: const int *getRegUnitPressureSets(unsigned RegUnit) const override; + unsigned getReturnAddressReg(const MachineFunction &MF) const { + // Not a callee saved register. 
+ return AMDGPU::SGPR30_SGPR31; + } + private: void buildSpillLoadStore(MachineBasicBlock::iterator MI, unsigned LoadStoreOp, diff --git a/lib/Target/AMDGPU/SOPInstructions.td b/lib/Target/AMDGPU/SOPInstructions.td index 593439c2a3cd..f2d8b6f7b7a4 100644 --- a/lib/Target/AMDGPU/SOPInstructions.td +++ b/lib/Target/AMDGPU/SOPInstructions.td @@ -186,11 +186,23 @@ def S_BITSET1_B32 : SOP1_32 <"s_bitset1_b32">; def S_BITSET1_B64 : SOP1_64_32 <"s_bitset1_b64">; def S_GETPC_B64 : SOP1_64_0 <"s_getpc_b64">; -let isTerminator = 1, isBarrier = 1, - isBranch = 1, isIndirectBranch = 1 in { +let isTerminator = 1, isBarrier = 1, SchedRW = [WriteBranch] in { + +let isBranch = 1, isIndirectBranch = 1 in { def S_SETPC_B64 : SOP1_1 <"s_setpc_b64">; +} // End isBranch = 1, isIndirectBranch = 1 + +let isReturn = 1 in { +// Define variant marked as return rather than branch. +def S_SETPC_B64_return : SOP1_1<"", [(AMDGPUret_flag i64:$src0)]>; +} +} // End isTerminator = 1, isBarrier = 1 + +let isCall = 1 in { +def S_SWAPPC_B64 : SOP1_64 <"s_swappc_b64" +>; } -def S_SWAPPC_B64 : SOP1_64 <"s_swappc_b64">; + def S_RFE_B64 : SOP1_1 <"s_rfe_b64">; let hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] in { diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index d565c84bfeda..2abd4afad3b6 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -518,7 +518,18 @@ bool isCompute(CallingConv::ID cc) { } bool isEntryFunctionCC(CallingConv::ID CC) { - return true; + switch (CC) { + case CallingConv::AMDGPU_KERNEL: + case CallingConv::SPIR_KERNEL: + case CallingConv::AMDGPU_VS: + case CallingConv::AMDGPU_GS: + case CallingConv::AMDGPU_PS: + case CallingConv::AMDGPU_CS: + case CallingConv::AMDGPU_HS: + return true; + default: + return false; + } } bool isSI(const MCSubtargetInfo &STI) { diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index d6c836eb748b..8e74aa2cc9a8 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -262,7 +262,6 @@ bool isEntryFunctionCC(CallingConv::ID CC); LLVM_READNONE inline bool isKernel(CallingConv::ID CC) { switch (CC) { - case CallingConv::C: case CallingConv::AMDGPU_KERNEL: case CallingConv::SPIR_KERNEL: return true; diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp index 5583d6148b08..1979cbf50125 100644 --- a/lib/Target/ARM/ARMTargetMachine.cpp +++ b/lib/Target/ARM/ARMTargetMachine.cpp @@ -471,7 +471,7 @@ void ARMPassConfig::addIRPasses() { if (TM->Options.ThreadModel == ThreadModel::Single) addPass(createLowerAtomicPass()); else - addPass(createAtomicExpandPass(TM)); + addPass(createAtomicExpandPass()); // Cmpxchg instructions are often used with a subsequent comparison to // determine whether it succeeded. We can exploit existing control-flow in @@ -486,7 +486,7 @@ void ARMPassConfig::addIRPasses() { // Match interleaved memory accesses to ldN/stN intrinsics. 
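Looking back at the AMDGPUBaseInfo hunk above: the isEntryFunctionCC switch is the pivot for the whole calling-convention work, since only kernel and graphics-shader conventions count as entry points now, and a plain C calling convention marks a callable function. Restated as a self-contained sketch (the enum mirrors the relevant llvm::CallingConv entries):

#include <cassert>

enum class Conv { C, AMDGPU_KERNEL, SPIR_KERNEL, AMDGPU_VS, AMDGPU_GS,
                  AMDGPU_PS, AMDGPU_CS, AMDGPU_HS };

bool isEntryFunctionCC(Conv CC) {
  switch (CC) {
  case Conv::AMDGPU_KERNEL:
  case Conv::SPIR_KERNEL:
  case Conv::AMDGPU_VS:
  case Conv::AMDGPU_GS:
  case Conv::AMDGPU_PS:
  case Conv::AMDGPU_CS:
  case Conv::AMDGPU_HS:
    return true;
  default:
    return false;
  }
}

int main() {
  assert(isEntryFunctionCC(Conv::AMDGPU_KERNEL));
  assert(!isEntryFunctionCC(Conv::C)); // callable: gets CSRs, a stack, a return address
}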
if (TM->getOptLevel() != CodeGenOpt::None) - addPass(createInterleavedAccessPass(TM)); + addPass(createInterleavedAccessPass()); } bool ARMPassConfig::addPreISel() { diff --git a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp index e4df7ff5c200..e6ea67d55b43 100644 --- a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp +++ b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp @@ -399,7 +399,7 @@ void Simplifier::Context::cleanup() { for (Value *V : Clones) { Instruction *U = cast<Instruction>(V); if (!U->getParent()) - delete U; + U->deleteValue(); } } diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp index 6913d50bbcaa..8e93df6201ae 100644 --- a/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -252,7 +252,7 @@ void HexagonPassConfig::addIRPasses() { TargetPassConfig::addIRPasses(); bool NoOpt = (getOptLevel() == CodeGenOpt::None); - addPass(createAtomicExpandPass(TM)); + addPass(createAtomicExpandPass()); if (!NoOpt) { if (EnableLoopPrefetch) addPass(createLoopDataPrefetchPass()); diff --git a/lib/Target/Mips/Mips.h b/lib/Target/Mips/Mips.h index 7553f3972f5d..008b9505ee26 100644 --- a/lib/Target/Mips/Mips.h +++ b/lib/Target/Mips/Mips.h @@ -23,14 +23,14 @@ namespace llvm { class ModulePass; class FunctionPass; - ModulePass *createMipsOs16Pass(MipsTargetMachine &TM); - ModulePass *createMips16HardFloatPass(MipsTargetMachine &TM); + ModulePass *createMipsOs16Pass(); + ModulePass *createMips16HardFloatPass(); - FunctionPass *createMipsModuleISelDagPass(MipsTargetMachine &TM); - FunctionPass *createMipsOptimizePICCallPass(MipsTargetMachine &TM); - FunctionPass *createMipsDelaySlotFillerPass(MipsTargetMachine &TM); + FunctionPass *createMipsModuleISelDagPass(); + FunctionPass *createMipsOptimizePICCallPass(); + FunctionPass *createMipsDelaySlotFillerPass(); FunctionPass *createMipsHazardSchedule(); - FunctionPass *createMipsLongBranchPass(MipsTargetMachine &TM); + FunctionPass *createMipsLongBranchPass(); FunctionPass *createMipsConstantIslandPass(); FunctionPass *createMicroMipsSizeReductionPass(); } // end namespace llvm; diff --git a/lib/Target/Mips/Mips16HardFloat.cpp b/lib/Target/Mips/Mips16HardFloat.cpp index 5a394fe02f16..3c2426129e49 100644 --- a/lib/Target/Mips/Mips16HardFloat.cpp +++ b/lib/Target/Mips/Mips16HardFloat.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "MipsTargetMachine.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Module.h" #include "llvm/IR/Value.h" #include "llvm/Support/Debug.h" @@ -28,14 +29,16 @@ namespace { public: static char ID; - Mips16HardFloat(MipsTargetMachine &TM_) : ModulePass(ID), TM(TM_) {} + Mips16HardFloat() : ModulePass(ID) {} StringRef getPassName() const override { return "MIPS16 Hard Float Pass"; } - bool runOnModule(Module &M) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetPassConfig>(); + ModulePass::getAnalysisUsage(AU); + } - protected: - const MipsTargetMachine &TM; + bool runOnModule(Module &M) override; }; static void EmitInlineAsm(LLVMContext &C, BasicBlock *BB, StringRef AsmText) { @@ -520,6 +523,8 @@ static void removeUseSoftFloat(Function &F) { // during call lowering but it should be moved here in the future. 
// bool Mips16HardFloat::runOnModule(Module &M) { + auto &TM = static_cast<const MipsTargetMachine &>( + getAnalysis<TargetPassConfig>().getTM<TargetMachine>()); DEBUG(errs() << "Run on Module Mips16HardFloat\n"); bool Modified = false; for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) { @@ -541,6 +546,6 @@ bool Mips16HardFloat::runOnModule(Module &M) { } -ModulePass *llvm::createMips16HardFloatPass(MipsTargetMachine &TM) { - return new Mips16HardFloat(TM); +ModulePass *llvm::createMips16HardFloatPass() { + return new Mips16HardFloat(); } diff --git a/lib/Target/Mips/MipsDelaySlotFiller.cpp b/lib/Target/Mips/MipsDelaySlotFiller.cpp index 1597057ad63f..5d82571ff94f 100644 --- a/lib/Target/Mips/MipsDelaySlotFiller.cpp +++ b/lib/Target/Mips/MipsDelaySlotFiller.cpp @@ -211,12 +211,12 @@ namespace { class Filler : public MachineFunctionPass { public: - Filler(TargetMachine &tm) - : MachineFunctionPass(ID), TM(tm) { } + Filler() : MachineFunctionPass(ID), TM(nullptr) {} StringRef getPassName() const override { return "Mips Delay Slot Filler"; } bool runOnMachineFunction(MachineFunction &F) override { + TM = &F.getTarget(); bool Changed = false; for (MachineFunction::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) @@ -290,7 +290,7 @@ namespace { bool terminateSearch(const MachineInstr &Candidate) const; - TargetMachine &TM; + const TargetMachine *TM; static char ID; }; @@ -610,7 +610,7 @@ bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) { Changed = true; // Delay slot filling is disabled at -O0. - if (!DisableDelaySlotFiller && (TM.getOptLevel() != CodeGenOpt::None)) { + if (!DisableDelaySlotFiller && (TM->getOptLevel() != CodeGenOpt::None)) { bool Filled = false; if (MipsCompactBranchPolicy.getValue() != CB_Always || @@ -910,6 +910,4 @@ bool Filler::terminateSearch(const MachineInstr &Candidate) const { /// createMipsDelaySlotFillerPass - Returns a pass that fills in delay /// slots in Mips MachineFunctions -FunctionPass *llvm::createMipsDelaySlotFillerPass(MipsTargetMachine &tm) { - return new Filler(tm); -} +FunctionPass *llvm::createMipsDelaySlotFillerPass() { return new Filler(); } diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index 78bae6954c3c..3641a70d61b5 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -795,7 +795,7 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG, SDValue And0 = N->getOperand(0), And1 = N->getOperand(1); uint64_t SMPos0, SMSize0, SMPos1, SMSize1; - ConstantSDNode *CN; + ConstantSDNode *CN, *CN1; // See if Op's first operand matches (and $src1 , mask0). if (And0.getOpcode() != ISD::AND) @@ -806,37 +806,74 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); // See if Op's second operand matches (and (shl $src, pos), mask1). - if (And1.getOpcode() != ISD::AND) - return SDValue(); + if (And1.getOpcode() == ISD::AND && + And1.getOperand(0).getOpcode() == ISD::SHL) { - if (!(CN = dyn_cast<ConstantSDNode>(And1.getOperand(1))) || - !isShiftedMask(CN->getZExtValue(), SMPos1, SMSize1)) - return SDValue(); + if (!(CN = dyn_cast<ConstantSDNode>(And1.getOperand(1))) || + !isShiftedMask(CN->getZExtValue(), SMPos1, SMSize1)) + return SDValue(); - // The shift masks must have the same position and size. - if (SMPos0 != SMPos1 || SMSize0 != SMSize1) - return SDValue(); + // The shift masks must have the same position and size. 
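The Mips16HardFloat and MipsModuleDAGToDAGISel hunks above share one refactoring pattern: rather than threading a TargetMachine through the pass constructor, the pass requires TargetPassConfig and recovers the TM at run time. A skeleton of that pattern, reusing the same getAnalysisUsage/getAnalysis/getTM calls visible in the diff itself; it is a fragment that only builds inside the LLVM tree, not a standalone program:

#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;

namespace {
struct ExamplePass : ModulePass {
  static char ID;
  ExamplePass() : ModulePass(ID) {} // note: no TargetMachine parameter

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetPassConfig>(); // declare the dependency instead
    ModulePass::getAnalysisUsage(AU);
  }

  bool runOnModule(Module &) override {
    // Recover the TargetMachine lazily, at run time.
    auto &TM = getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
    (void)TM; // a real pass would consult TM here, e.g. TM.getOptLevel()
    return false;
  }
};

char ExamplePass::ID = 0;
} // end anonymous namespace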
+ if (SMPos0 != SMPos1 || SMSize0 != SMSize1) + return SDValue(); - SDValue Shl = And1.getOperand(0); - if (Shl.getOpcode() != ISD::SHL) - return SDValue(); + SDValue Shl = And1.getOperand(0); - if (!(CN = dyn_cast<ConstantSDNode>(Shl.getOperand(1)))) - return SDValue(); + if (!(CN = dyn_cast<ConstantSDNode>(Shl.getOperand(1)))) + return SDValue(); - unsigned Shamt = CN->getZExtValue(); + unsigned Shamt = CN->getZExtValue(); - // Return if the shift amount and the first bit position of mask are not the - // same. - EVT ValTy = N->getValueType(0); - if ((Shamt != SMPos0) || (SMPos0 + SMSize0 > ValTy.getSizeInBits())) - return SDValue(); + // Return if the shift amount and the first bit position of mask are not the + // same. + EVT ValTy = N->getValueType(0); + if ((Shamt != SMPos0) || (SMPos0 + SMSize0 > ValTy.getSizeInBits())) + return SDValue(); - SDLoc DL(N); - return DAG.getNode(MipsISD::Ins, DL, ValTy, Shl.getOperand(0), - DAG.getConstant(SMPos0, DL, MVT::i32), - DAG.getConstant(SMSize0, DL, MVT::i32), - And0.getOperand(0)); + SDLoc DL(N); + return DAG.getNode(MipsISD::Ins, DL, ValTy, Shl.getOperand(0), + DAG.getConstant(SMPos0, DL, MVT::i32), + DAG.getConstant(SMSize0, DL, MVT::i32), + And0.getOperand(0)); + } else { + // Pattern match DINS. + // $dst = or (and $src, mask0), mask1 + // where mask0 = ((1 << SMSize0) -1) << SMPos0 + // => dins $dst, $src, pos, size + if (~CN->getSExtValue() == ((((int64_t)1 << SMSize0) - 1) << SMPos0) && + ((SMSize0 + SMPos0 <= 64 && Subtarget.hasMips64r2()) || + (SMSize0 + SMPos0 <= 32))) { + // Check if AND instruction has constant as argument + bool isConstCase = And1.getOpcode() != ISD::AND; + if (And1.getOpcode() == ISD::AND) { + if (!(CN1 = dyn_cast<ConstantSDNode>(And1->getOperand(1)))) + return SDValue(); + } else { + if (!(CN1 = dyn_cast<ConstantSDNode>(N->getOperand(1)))) + return SDValue(); + } + SDLoc DL(N); + EVT ValTy = N->getOperand(0)->getValueType(0); + SDValue Const1; + SDValue SrlX; + if (!isConstCase) { + Const1 = DAG.getConstant(SMPos0, DL, MVT::i32); + SrlX = DAG.getNode(ISD::SRL, DL, And1->getValueType(0), And1, Const1); + } + return DAG.getNode( + MipsISD::Ins, DL, N->getValueType(0), + isConstCase + ? DAG.getConstant(CN1->getSExtValue() >> SMPos0, DL, ValTy) + : SrlX, + DAG.getConstant(SMPos0, DL, MVT::i32), + DAG.getConstant(ValTy.getSizeInBits() / 8 < 8 ? 
SMSize0 & 31 + : SMSize0, + DL, MVT::i32), + And0->getOperand(0)); + + } + return SDValue(); + } } static SDValue performADDCombine(SDNode *N, SelectionDAG &DAG, diff --git a/lib/Target/Mips/MipsLongBranch.cpp b/lib/Target/Mips/MipsLongBranch.cpp index 100503700a72..b95f1158fa56 100644 --- a/lib/Target/Mips/MipsLongBranch.cpp +++ b/lib/Target/Mips/MipsLongBranch.cpp @@ -75,9 +75,8 @@ namespace { public: static char ID; - MipsLongBranch(TargetMachine &tm) - : MachineFunctionPass(ID), TM(tm), IsPIC(TM.isPositionIndependent()), - ABI(static_cast<const MipsTargetMachine &>(TM).getABI()) {} + MipsLongBranch() + : MachineFunctionPass(ID), ABI(MipsABIInfo::Unknown()) {} StringRef getPassName() const override { return "Mips Long Branch"; } @@ -96,7 +95,6 @@ namespace { MachineBasicBlock *MBBOpnd); void expandToLongBranch(MBBInfo &Info); - const TargetMachine &TM; MachineFunction *MF; SmallVector<MBBInfo, 16> MBBInfos; bool IsPIC; @@ -469,6 +467,12 @@ bool MipsLongBranch::runOnMachineFunction(MachineFunction &F) { static_cast<const MipsSubtarget &>(F.getSubtarget()); const MipsInstrInfo *TII = static_cast<const MipsInstrInfo *>(STI.getInstrInfo()); + + + const TargetMachine& TM = F.getTarget(); + IsPIC = TM.isPositionIndependent(); + ABI = static_cast<const MipsTargetMachine &>(TM).getABI(); + LongBranchSeqSize = !IsPIC ? 2 : (ABI.IsN64() ? 10 : (!STI.isTargetNaCl() ? 9 : 10)); @@ -541,6 +545,4 @@ bool MipsLongBranch::runOnMachineFunction(MachineFunction &F) { /// createMipsLongBranchPass - Returns a pass that converts branches to long /// branches. -FunctionPass *llvm::createMipsLongBranchPass(MipsTargetMachine &tm) { - return new MipsLongBranch(tm); -} +FunctionPass *llvm::createMipsLongBranchPass() { return new MipsLongBranch(); } diff --git a/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp b/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp index cf85eb3f2416..ceacaa498389 100644 --- a/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp +++ b/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp @@ -10,6 +10,7 @@ #include "Mips.h" #include "MipsTargetMachine.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -22,18 +23,19 @@ namespace { public: static char ID; - explicit MipsModuleDAGToDAGISel(MipsTargetMachine &TM_) - : MachineFunctionPass(ID), TM(TM_) {} + MipsModuleDAGToDAGISel() : MachineFunctionPass(ID) {} // Pass Name StringRef getPassName() const override { return "MIPS DAG->DAG Pattern Instruction Selection"; } - bool runOnMachineFunction(MachineFunction &MF) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetPassConfig>(); + MachineFunctionPass::getAnalysisUsage(AU); + } - protected: - MipsTargetMachine &TM; + bool runOnMachineFunction(MachineFunction &MF) override; }; char MipsModuleDAGToDAGISel::ID = 0; @@ -41,10 +43,12 @@ namespace { bool MipsModuleDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { DEBUG(errs() << "In MipsModuleDAGToDAGISel::runMachineFunction\n"); + auto &TPC = getAnalysis<TargetPassConfig>(); + auto &TM = TPC.getTM<MipsTargetMachine>(); TM.resetSubtarget(&MF); return false; } -llvm::FunctionPass *llvm::createMipsModuleISelDagPass(MipsTargetMachine &TM) { - return new MipsModuleDAGToDAGISel(TM); +llvm::FunctionPass *llvm::createMipsModuleISelDagPass() { + return new MipsModuleDAGToDAGISel(); } diff --git a/lib/Target/Mips/MipsOptimizePICCall.cpp b/lib/Target/Mips/MipsOptimizePICCall.cpp index f8d9c34556bc..94a1965f9ffb 100644 --- a/lib/Target/Mips/MipsOptimizePICCall.cpp +++ 
b/lib/Target/Mips/MipsOptimizePICCall.cpp @@ -59,7 +59,7 @@ private: class OptimizePICCall : public MachineFunctionPass { public: - OptimizePICCall(TargetMachine &tm) : MachineFunctionPass(ID) {} + OptimizePICCall() : MachineFunctionPass(ID) {} StringRef getPassName() const override { return "Mips OptimizePICCall"; } @@ -297,6 +297,6 @@ void OptimizePICCall::incCntAndSetReg(ValueType Entry, unsigned Reg) { } /// Return an OptimizeCall object. -FunctionPass *llvm::createMipsOptimizePICCallPass(MipsTargetMachine &TM) { - return new OptimizePICCall(TM); +FunctionPass *llvm::createMipsOptimizePICCallPass() { + return new OptimizePICCall(); } diff --git a/lib/Target/Mips/MipsOs16.cpp b/lib/Target/Mips/MipsOs16.cpp index 670b6c96e78e..70ead5cde6fa 100644 --- a/lib/Target/Mips/MipsOs16.cpp +++ b/lib/Target/Mips/MipsOs16.cpp @@ -155,6 +155,4 @@ bool MipsOs16::runOnModule(Module &M) { return modified; } -ModulePass *llvm::createMipsOs16Pass(MipsTargetMachine &TM) { - return new MipsOs16; -} +ModulePass *llvm::createMipsOs16Pass() { return new MipsOs16(); } diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp index 29a38fd35c1f..092de216e9b8 100644 --- a/lib/Target/Mips/MipsTargetMachine.cpp +++ b/lib/Target/Mips/MipsTargetMachine.cpp @@ -154,6 +154,11 @@ MipsTargetMachine::getSubtargetImpl(const Function &F) const { bool hasNoMips16Attr = !F.getFnAttribute("nomips16").hasAttribute(Attribute::None); + bool HasMicroMipsAttr = + !F.getFnAttribute("micromips").hasAttribute(Attribute::None); + bool HasNoMicroMipsAttr = + !F.getFnAttribute("nomicromips").hasAttribute(Attribute::None); + // FIXME: This is related to the code below to reset the target options, // we need to know whether or not the soft float flag is set on the // function, so we can enable it as a subtarget feature. @@ -165,6 +170,10 @@ MipsTargetMachine::getSubtargetImpl(const Function &F) const { FS += FS.empty() ? "+mips16" : ",+mips16"; else if (hasNoMips16Attr) FS += FS.empty() ? "-mips16" : ",-mips16"; + if (HasMicroMipsAttr) + FS += FS.empty() ? "+micromips" : ",+micromips"; + else if (HasNoMicroMipsAttr) + FS += FS.empty() ? "-micromips" : ",-micromips"; if (softFloat) FS += FS.empty() ? "+soft-float" : ",+soft-float"; @@ -223,23 +232,23 @@ TargetPassConfig *MipsTargetMachine::createPassConfig(PassManagerBase &PM) { void MipsPassConfig::addIRPasses() { TargetPassConfig::addIRPasses(); - addPass(createAtomicExpandPass(&getMipsTargetMachine())); + addPass(createAtomicExpandPass()); if (getMipsSubtarget().os16()) - addPass(createMipsOs16Pass(getMipsTargetMachine())); + addPass(createMipsOs16Pass()); if (getMipsSubtarget().inMips16HardFloat()) - addPass(createMips16HardFloatPass(getMipsTargetMachine())); + addPass(createMips16HardFloatPass()); } // Install an instruction selector pass using // the ISelDag to gen Mips code. bool MipsPassConfig::addInstSelector() { - addPass(createMipsModuleISelDagPass(getMipsTargetMachine())); + addPass(createMipsModuleISelDagPass()); addPass(createMips16ISelDag(getMipsTargetMachine(), getOptLevel())); addPass(createMipsSEISelDag(getMipsTargetMachine(), getOptLevel())); return false; } void MipsPassConfig::addPreRegAlloc() { - addPass(createMipsOptimizePICCallPass(getMipsTargetMachine())); + addPass(createMipsOptimizePICCallPass()); } TargetIRAnalysis MipsTargetMachine::getTargetIRAnalysis() { @@ -259,15 +268,14 @@ TargetIRAnalysis MipsTargetMachine::getTargetIRAnalysis() { // machine code is emitted. 
return true if -print-machineinstrs should // print out the code after the passes. void MipsPassConfig::addPreEmitPass() { - MipsTargetMachine &TM = getMipsTargetMachine(); addPass(createMicroMipsSizeReductionPass()); // The delay slot filler pass can potientially create forbidden slot (FS) // hazards for MIPSR6 which the hazard schedule pass (HSP) will fix. Any // (new) pass that creates compact branches after the HSP must handle FS // hazards itself or be pipelined before the HSP. - addPass(createMipsDelaySlotFillerPass(TM)); + addPass(createMipsDelaySlotFillerPass()); addPass(createMipsHazardSchedule()); - addPass(createMipsLongBranchPass(TM)); + addPass(createMipsLongBranchPass()); addPass(createMipsConstantIslandPass()); } diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 144aea850833..e65b1f1aa0a5 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -689,6 +689,14 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::SRA, MVT::v2i64, Legal); setOperationAction(ISD::SRL, MVT::v2i64, Legal); + // 128 bit shifts can be accomplished via 3 instructions for SHL and + // SRL, but not for SRA because of the instructions available: + // VS{RL} and VS{RL}O. However due to direct move costs, it's not worth + // doing + setOperationAction(ISD::SHL, MVT::v1i128, Expand); + setOperationAction(ISD::SRL, MVT::v1i128, Expand); + setOperationAction(ISD::SRA, MVT::v1i128, Expand); + setOperationAction(ISD::SETCC, MVT::v2i64, Legal); } else { @@ -742,6 +750,13 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, if (Subtarget.hasP9Vector()) { setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); + + // 128 bit shifts can be accomplished via 3 instructions for SHL and + // SRL, but not for SRA because of the instructions available: + // VS{RL} and VS{RL}O. 
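One more note on the MipsTargetMachine hunk above before the PowerPC changes continue: the micromips/nomicromips function attributes are folded into the subtarget feature string with the same comma-separated +feature/-feature convention already used for mips16 and soft-float. A small standalone model of that string building:

#include <cassert>
#include <string>

std::string addFeature(std::string FS, bool Has, bool HasNo,
                       const std::string &Name) {
  if (!Has && !HasNo)
    return FS; // attribute absent: leave the feature string alone
  std::string Feat = (Has ? "+" : "-") + Name;
  return FS.empty() ? Feat : FS + "," + Feat;
}

int main() {
  assert(addFeature("", true, false, "micromips") == "+micromips");
  assert(addFeature("+mips16", false, true, "micromips") == "+mips16,-micromips");
}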
+ setOperationAction(ISD::SHL, MVT::v1i128, Legal); + setOperationAction(ISD::SRL, MVT::v1i128, Legal); + setOperationAction(ISD::SRA, MVT::v1i128, Expand); } } diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td index e14d18fd5433..5465b5f2d66c 100644 --- a/lib/Target/PowerPC/PPCInstrAltivec.td +++ b/lib/Target/PowerPC/PPCInstrAltivec.td @@ -987,12 +987,16 @@ def : Pat<(v8i16 (shl v8i16:$vA, v8i16:$vB)), (v8i16 (VSLH $vA, $vB))>; def : Pat<(v4i32 (shl v4i32:$vA, v4i32:$vB)), (v4i32 (VSLW $vA, $vB))>; +def : Pat<(v1i128 (shl v1i128:$vA, v1i128:$vB)), + (v1i128 (VSL (VSLO $vA, $vB), (VSPLTB 15, $vB)))>; def : Pat<(v16i8 (PPCshl v16i8:$vA, v16i8:$vB)), (v16i8 (VSLB $vA, $vB))>; def : Pat<(v8i16 (PPCshl v8i16:$vA, v8i16:$vB)), (v8i16 (VSLH $vA, $vB))>; def : Pat<(v4i32 (PPCshl v4i32:$vA, v4i32:$vB)), (v4i32 (VSLW $vA, $vB))>; +def : Pat<(v1i128 (PPCshl v1i128:$vA, v1i128:$vB)), + (v1i128 (VSL (VSLO $vA, $vB), (VSPLTB 15, $vB)))>; def : Pat<(v16i8 (srl v16i8:$vA, v16i8:$vB)), (v16i8 (VSRB $vA, $vB))>; @@ -1000,12 +1004,16 @@ def : Pat<(v8i16 (srl v8i16:$vA, v8i16:$vB)), (v8i16 (VSRH $vA, $vB))>; def : Pat<(v4i32 (srl v4i32:$vA, v4i32:$vB)), (v4i32 (VSRW $vA, $vB))>; +def : Pat<(v1i128 (srl v1i128:$vA, v1i128:$vB)), + (v1i128 (VSR (VSRO $vA, $vB), (VSPLTB 15, $vB)))>; def : Pat<(v16i8 (PPCsrl v16i8:$vA, v16i8:$vB)), (v16i8 (VSRB $vA, $vB))>; def : Pat<(v8i16 (PPCsrl v8i16:$vA, v8i16:$vB)), (v8i16 (VSRH $vA, $vB))>; def : Pat<(v4i32 (PPCsrl v4i32:$vA, v4i32:$vB)), (v4i32 (VSRW $vA, $vB))>; +def : Pat<(v1i128 (PPCsrl v1i128:$vA, v1i128:$vB)), + (v1i128 (VSR (VSRO $vA, $vB), (VSPLTB 15, $vB)))>; def : Pat<(v16i8 (sra v16i8:$vA, v16i8:$vB)), (v16i8 (VSRAB $vA, $vB))>; diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index 3afcec1248d5..46f103141bc1 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -1533,6 +1533,8 @@ bool PPCInstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg, case PPC::FCMPUD: SrcReg = MI.getOperand(1).getReg(); SrcReg2 = MI.getOperand(2).getReg(); + Value = 0; + Mask = 0; return true; } } @@ -1591,9 +1593,12 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, // We can perform this optimization, equality only, if MI is // zero-extending. + // FIXME: Other possible target instructions include ANDISo and + // RLWINM aliases, such as ROTRWI, EXTLWI, SLWI and SRWI. if (MIOpC == PPC::CNTLZW || MIOpC == PPC::CNTLZWo || MIOpC == PPC::SLW || MIOpC == PPC::SLWo || MIOpC == PPC::SRW || MIOpC == PPC::SRWo || + MIOpC == PPC::ANDIo || isZeroExtendingRotate) { noSub = true; equalityOnly = true; @@ -1641,6 +1646,9 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, break; } + SmallVector<std::pair<MachineOperand*, PPC::Predicate>, 4> PredsToUpdate; + SmallVector<std::pair<MachineOperand*, unsigned>, 4> SubRegsToUpdate; + // There are two possible candidates which can be changed to set CR[01]. // One is MI, the other is a SUB instruction. // For CMPrr(r1,r2), we are looking for SUB(r1,r2) or SUB(r2,r1). @@ -1652,9 +1660,37 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, // same BB as the comparison. This is to allow the check below to avoid calls // (and other explicit clobbers); instead we should really check for these // more explicitly (in at least a few predecessors). 
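The PPCInstrAltivec patterns earlier in this hunk implement a 128-bit shift in two steps: VSLO/VSRO shifts by whole octets, VSL/VSR shifts by the remaining bits, and VSPLTB 15 splats the low byte of the shift amount so every lane agrees. A software model of the left-shift decomposition, assuming a compiler that provides __uint128_t (right shifts decompose the same way):

#include <cassert>
#include <cstdint>

__uint128_t shl128(__uint128_t V, uint8_t Amt) {
  __uint128_t AfterVSLO = V << ((Amt >> 3) * 8); // octet part, as VSLO does
  return AfterVSLO << (Amt & 7);                 // remaining bit part, as VSL does
}

int main() {
  __uint128_t One = 1;
  assert(shl128(One, 100) == One << 100);
  assert(shl128(One, 7) == One << 7);
}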
- else if (MI->getParent() != CmpInstr.getParent() || Value != 0) { - // PPC does not have a record-form SUBri. + else if (MI->getParent() != CmpInstr.getParent()) return false; + else if (Value != 0) { + // The record-form instructions set the CR bit based on a signed comparison against 0. + // We try to convert a compare against 1 or -1 into a compare against 0. + bool Success = false; + if (!equalityOnly && MRI->hasOneUse(CRReg)) { + MachineInstr *UseMI = &*MRI->use_instr_begin(CRReg); + if (UseMI->getOpcode() == PPC::BCC) { + PPC::Predicate Pred = (PPC::Predicate)UseMI->getOperand(0).getImm(); + int16_t Immed = (int16_t)Value; + + if (Immed == -1 && Pred == PPC::PRED_GT) { + // We convert "greater than -1" into "greater than or equal to 0", + // since we are assuming a signed comparison (implied by !equalityOnly). + PredsToUpdate.push_back(std::make_pair(&(UseMI->getOperand(0)), + PPC::PRED_GE)); + Success = true; + } + else if (Immed == 1 && Pred == PPC::PRED_LT) { + // We convert "less than 1" into "less than or equal to 0". + PredsToUpdate.push_back(std::make_pair(&(UseMI->getOperand(0)), + PPC::PRED_LE)); + Success = true; + } + } + } + + // PPC does not have a record-form SUBri. + if (!Success) + return false; } // Search for Sub. @@ -1720,15 +1756,14 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, if (NewOpC == -1) return false; - SmallVector<std::pair<MachineOperand*, PPC::Predicate>, 4> PredsToUpdate; - SmallVector<std::pair<MachineOperand*, unsigned>, 4> SubRegsToUpdate; - // If we have SUB(r1, r2) and CMP(r2, r1), the condition code based on CMP // needs to be updated to be based on SUB. Push the condition code // operands to OperandsToUpdate. If it is safe to remove CmpInstr, the // condition code of these operands will be modified. + // Here, Value == 0 means we haven't converted a comparison against 1 or -1 + // into a comparison against 0, which may modify the predicate. bool ShouldSwap = false; - if (Sub) { + if (Sub && Value == 0) { ShouldSwap = SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 && Sub->getOperand(2).getReg() == SrcReg; @@ -1765,6 +1800,9 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, } else // We need to abort on a user we don't understand. return false; } + assert(!(Value != 0 && ShouldSwap) && + "Non-zero immediate support and ShouldSwap " + "may conflict in updating predicate"); // Create a new virtual register to hold the value of the CR set by the // record-form instruction. If the instruction was not previously in diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp index 7806d45b5457..ddae5befee3e 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -322,7 +322,7 @@ TargetPassConfig *PPCTargetMachine::createPassConfig(PassManagerBase &PM) { void PPCPassConfig::addIRPasses() { if (TM->getOptLevel() != CodeGenOpt::None) addPass(createPPCBoolRetToIntPass()); - addPass(createAtomicExpandPass(&getPPCTargetMachine())); + addPass(createAtomicExpandPass()); // For the BG/Q (or if explicitly requested), add explicit data prefetch // intrinsics.
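The v1i128 shift patterns above hinge on a two-step decomposition: VSLO shifts by whole octets and VSL by the remaining 0-7 bits, with VSPLTB 15 replicating the residual count into every byte as VSL requires. A minimal standalone C++ sketch of that decomposition (illustrative only, assuming a compiler that provides unsigned __int128; this is not the TableGen pattern itself):

#include <cassert>

using u128 = unsigned __int128;

// Shift left by the whole-octet part of the count (the VSLO step), then
// by the residual 0-7 bits (the VSL step); together they equal one shift.
u128 shl128(u128 Val, unsigned Amt) {
  Amt &= 127;                         // the count is taken mod 128
  u128 ByOctets = Val << (Amt & ~7u); // octet shift, as VSLO does
  return ByOctets << (Amt & 7u);      // residual bit shift, as VSL does
}

int main() {
  u128 One = 1;
  assert(shl128(One, 100) == One << 100);
  return 0;
}

The optimizeCompareInstr change above likewise rests on a small signed-integer identity: x > -1 is x >= 0, and x < 1 is x <= 0. A hedged sketch of the predicate rewrite, with names that mirror but are not taken from PPC::Predicate:

#include <cassert>

enum Predicate { PRED_LT, PRED_LE, PRED_GT, PRED_GE };

// A signed compare against +1/-1 can reuse the record-form compare
// against zero once the branch predicate is adjusted; any other
// immediate is rejected and the real compare is kept.
bool foldImmCompareToZero(int Imm, Predicate &Pred) {
  if (Imm == -1 && Pred == PRED_GT) { Pred = PRED_GE; return true; }
  if (Imm == 1 && Pred == PRED_LT) { Pred = PRED_LE; return true; }
  return false;
}

int main() {
  Predicate P = PRED_GT;
  assert(foldImmCompareToZero(-1, P) && P == PRED_GE);
  P = PRED_LT;
  assert(foldImmCompareToZero(1, P) && P == PRED_LE);
  return 0;
}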
diff --git a/lib/Target/Sparc/DelaySlotFiller.cpp b/lib/Target/Sparc/DelaySlotFiller.cpp index 6f9cc314e376..df819ccd15db 100644 --- a/lib/Target/Sparc/DelaySlotFiller.cpp +++ b/lib/Target/Sparc/DelaySlotFiller.cpp @@ -96,7 +96,7 @@ namespace { /// createSparcDelaySlotFillerPass - Returns a pass that fills in delay /// slots in Sparc MachineFunctions /// -FunctionPass *llvm::createSparcDelaySlotFillerPass(TargetMachine &tm) { +FunctionPass *llvm::createSparcDelaySlotFillerPass() { return new Filler; } diff --git a/lib/Target/Sparc/LeonPasses.cpp b/lib/Target/Sparc/LeonPasses.cpp index 0acc2875daa8..ca6a0dc3c2a3 100755 --- a/lib/Target/Sparc/LeonPasses.cpp +++ b/lib/Target/Sparc/LeonPasses.cpp @@ -21,9 +21,6 @@ #include "llvm/Support/raw_ostream.h" using namespace llvm; -LEONMachineFunctionPass::LEONMachineFunctionPass(TargetMachine &tm, char &ID) - : MachineFunctionPass(ID) {} - LEONMachineFunctionPass::LEONMachineFunctionPass(char &ID) : MachineFunctionPass(ID) {} @@ -72,8 +69,7 @@ int LEONMachineFunctionPass::getUnusedFPRegister(MachineRegisterInfo &MRI) { // char InsertNOPLoad::ID = 0; -InsertNOPLoad::InsertNOPLoad(TargetMachine &tm) - : LEONMachineFunctionPass(tm, ID) {} +InsertNOPLoad::InsertNOPLoad() : LEONMachineFunctionPass(ID) {} bool InsertNOPLoad::runOnMachineFunction(MachineFunction &MF) { Subtarget = &MF.getSubtarget<SparcSubtarget>(); @@ -114,7 +110,7 @@ bool InsertNOPLoad::runOnMachineFunction(MachineFunction &MF) { // char FixFSMULD::ID = 0; -FixFSMULD::FixFSMULD(TargetMachine &tm) : LEONMachineFunctionPass(tm, ID) {} +FixFSMULD::FixFSMULD() : LEONMachineFunctionPass(ID) {} bool FixFSMULD::runOnMachineFunction(MachineFunction &MF) { Subtarget = &MF.getSubtarget<SparcSubtarget>(); @@ -203,8 +199,7 @@ bool FixFSMULD::runOnMachineFunction(MachineFunction &MF) { // char ReplaceFMULS::ID = 0; -ReplaceFMULS::ReplaceFMULS(TargetMachine &tm) - : LEONMachineFunctionPass(tm, ID) {} +ReplaceFMULS::ReplaceFMULS() : LEONMachineFunctionPass(ID) {} bool ReplaceFMULS::runOnMachineFunction(MachineFunction &MF) { Subtarget = &MF.getSubtarget<SparcSubtarget>(); @@ -287,8 +282,7 @@ bool ReplaceFMULS::runOnMachineFunction(MachineFunction &MF) { char DetectRoundChange::ID = 0; -DetectRoundChange::DetectRoundChange(TargetMachine &tm) - : LEONMachineFunctionPass(tm, ID) {} +DetectRoundChange::DetectRoundChange() : LEONMachineFunctionPass(ID) {} bool DetectRoundChange::runOnMachineFunction(MachineFunction &MF) { Subtarget = &MF.getSubtarget<SparcSubtarget>(); @@ -338,8 +332,7 @@ bool DetectRoundChange::runOnMachineFunction(MachineFunction &MF) { // char FixAllFDIVSQRT::ID = 0; -FixAllFDIVSQRT::FixAllFDIVSQRT(TargetMachine &tm) - : LEONMachineFunctionPass(tm, ID) {} +FixAllFDIVSQRT::FixAllFDIVSQRT() : LEONMachineFunctionPass(ID) {} bool FixAllFDIVSQRT::runOnMachineFunction(MachineFunction &MF) { Subtarget = &MF.getSubtarget<SparcSubtarget>(); diff --git a/lib/Target/Sparc/LeonPasses.h b/lib/Target/Sparc/LeonPasses.h index 2158cb636bfc..99cdfc4589ef 100755 --- a/lib/Target/Sparc/LeonPasses.h +++ b/lib/Target/Sparc/LeonPasses.h @@ -32,7 +32,6 @@ protected: std::vector<int> UsedRegisters; protected: - LEONMachineFunctionPass(TargetMachine &tm, char &ID); LEONMachineFunctionPass(char &ID); int GetRegIndexForOperand(MachineInstr &MI, int OperandIndex); @@ -48,7 +47,7 @@ class LLVM_LIBRARY_VISIBILITY InsertNOPLoad : public LEONMachineFunctionPass { public: static char ID; - InsertNOPLoad(TargetMachine &tm); + InsertNOPLoad(); bool runOnMachineFunction(MachineFunction &MF) override; StringRef getPassName() 
const override { @@ -62,7 +61,7 @@ class LLVM_LIBRARY_VISIBILITY FixFSMULD : public LEONMachineFunctionPass { public: static char ID; - FixFSMULD(TargetMachine &tm); + FixFSMULD(); bool runOnMachineFunction(MachineFunction &MF) override; StringRef getPassName() const override { @@ -74,7 +73,7 @@ class LLVM_LIBRARY_VISIBILITY ReplaceFMULS : public LEONMachineFunctionPass { public: static char ID; - ReplaceFMULS(TargetMachine &tm); + ReplaceFMULS(); bool runOnMachineFunction(MachineFunction &MF) override; StringRef getPassName() const override { @@ -89,7 +88,7 @@ class LLVM_LIBRARY_VISIBILITY DetectRoundChange public: static char ID; - DetectRoundChange(TargetMachine &tm); + DetectRoundChange(); bool runOnMachineFunction(MachineFunction &MF) override; StringRef getPassName() const override { @@ -102,7 +101,7 @@ class LLVM_LIBRARY_VISIBILITY FixAllFDIVSQRT : public LEONMachineFunctionPass { public: static char ID; - FixAllFDIVSQRT(TargetMachine &tm); + FixAllFDIVSQRT(); bool runOnMachineFunction(MachineFunction &MF) override; StringRef getPassName() const override { diff --git a/lib/Target/Sparc/Sparc.h b/lib/Target/Sparc/Sparc.h index 0a8272d89297..4135e4e1b61d 100644 --- a/lib/Target/Sparc/Sparc.h +++ b/lib/Target/Sparc/Sparc.h @@ -28,7 +28,7 @@ namespace llvm { class MachineInstr; FunctionPass *createSparcISelDag(SparcTargetMachine &TM); - FunctionPass *createSparcDelaySlotFillerPass(TargetMachine &TM); + FunctionPass *createSparcDelaySlotFillerPass(); void LowerSparcMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, diff --git a/lib/Target/Sparc/SparcTargetMachine.cpp b/lib/Target/Sparc/SparcTargetMachine.cpp index 4ae64062d9e2..1da4d3604304 100644 --- a/lib/Target/Sparc/SparcTargetMachine.cpp +++ b/lib/Target/Sparc/SparcTargetMachine.cpp @@ -132,7 +132,7 @@ TargetPassConfig *SparcTargetMachine::createPassConfig(PassManagerBase &PM) { } void SparcPassConfig::addIRPasses() { - addPass(createAtomicExpandPass(&getSparcTargetMachine())); + addPass(createAtomicExpandPass()); TargetPassConfig::addIRPasses(); } @@ -143,26 +143,26 @@ bool SparcPassConfig::addInstSelector() { } void SparcPassConfig::addPreEmitPass(){ - addPass(createSparcDelaySlotFillerPass(getSparcTargetMachine())); + addPass(createSparcDelaySlotFillerPass()); if (this->getSparcTargetMachine().getSubtargetImpl()->insertNOPLoad()) { - addPass(new InsertNOPLoad(getSparcTargetMachine())); + addPass(new InsertNOPLoad()); } if (this->getSparcTargetMachine().getSubtargetImpl()->fixFSMULD()) { - addPass(new FixFSMULD(getSparcTargetMachine())); + addPass(new FixFSMULD()); } if (this->getSparcTargetMachine().getSubtargetImpl()->replaceFMULS()) { - addPass(new ReplaceFMULS(getSparcTargetMachine())); + addPass(new ReplaceFMULS()); } if (this->getSparcTargetMachine().getSubtargetImpl()->detectRoundChange()) { - addPass(new DetectRoundChange(getSparcTargetMachine())); + addPass(new DetectRoundChange()); } if (this->getSparcTargetMachine().getSubtargetImpl()->fixAllFDIVSQRT()) { - addPass(new FixAllFDIVSQRT(getSparcTargetMachine())); + addPass(new FixAllFDIVSQRT()); } } diff --git a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp index 44c794ef5da1..b974681fb6af 100644 --- a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -173,7 +173,7 @@ void WebAssemblyPassConfig::addIRPasses() { else // Expand some atomic operations. WebAssemblyTargetLowering has hooks which // control specifically what gets lowered. 
- addPass(createAtomicExpandPass(TM)); + addPass(createAtomicExpandPass()); // Fix function bitcasts, as WebAssembly requires caller and callee signatures // to match. diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 784c3a6557ff..3a421fe77392 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -235,6 +235,8 @@ def FeatureLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true", "LEA instruction needs inputs at AG stage">; def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true", "LEA instruction with certain arguments is slow">; +def FeatureSlow3OpsLEA : SubtargetFeature<"slow-3ops-lea", "Slow3OpsLEA", "true", + "LEA instruction with 3 ops or certain registers is slow">; def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true", "INC and DEC instructions are slower than ADD and SUB">; def FeatureSoftFloat @@ -480,6 +482,7 @@ def SNBFeatures : ProcessorFeatures<[], [ FeatureXSAVE, FeatureXSAVEOPT, FeatureLAHFSAHF, + FeatureSlow3OpsLEA, FeatureFastScalarFSQRT, FeatureFastSHLDRotate ]>; diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td index 6781d761a1c4..7d146d050a5c 100644 --- a/lib/Target/X86/X86CallingConv.td +++ b/lib/Target/X86/X86CallingConv.td @@ -73,8 +73,8 @@ def CC_#NAME : CallingConv<[ CCIfSubtarget<"is64Bit()", CCIfByVal<CCPassByVal<8, 8>>>, CCIfByVal<CCPassByVal<4, 4>>, - // Promote i1/i8/i16 arguments to i32. - CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, + // Promote i1/i8/i16/v1i1 arguments to i32. + CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>, // Promote v8i1/v16i1/v32i1 arguments to i32. CCIfType<[v8i1, v16i1, v32i1], CCPromoteToType<i32>>, @@ -146,8 +146,8 @@ def CC_#NAME : CallingConv<[ ]>; def RetCC_#NAME : CallingConv<[ - // Promote i1, v8i1 arguments to i8. - CCIfType<[i1, v8i1], CCPromoteToType<i8>>, + // Promote i1, v1i1, v8i1 arguments to i8. + CCIfType<[i1, v1i1, v8i1], CCPromoteToType<i8>>, // Promote v16i1 arguments to i16. CCIfType<[v16i1], CCPromoteToType<i16>>, @@ -207,6 +207,7 @@ def RetCC_X86Common : CallingConv<[ // // For code that doesn't care about the ABI, we allow returning more than two // integer values in registers. + CCIfType<[v1i1], CCPromoteToType<i8>>, CCIfType<[i1], CCPromoteToType<i8>>, CCIfType<[i8] , CCAssignToReg<[AL, DL, CL]>>, CCIfType<[i16], CCAssignToReg<[AX, DX, CX]>>, @@ -375,6 +376,7 @@ def RetCC_X86_64_Swift : CallingConv<[ CCIfSwiftError<CCIfType<[i64], CCAssignToReg<[R12]>>>, // For integers, ECX, R8D can be used as extra return registers. + CCIfType<[v1i1], CCPromoteToType<i8>>, CCIfType<[i1], CCPromoteToType<i8>>, CCIfType<[i8] , CCAssignToReg<[AL, DL, CL, R8B]>>, CCIfType<[i16], CCAssignToReg<[AX, DX, CX, R8W]>>, @@ -485,8 +487,8 @@ def CC_X86_64_C : CallingConv<[ // Handles byval parameters. CCIfByVal<CCPassByVal<8, 8>>, - // Promote i1/i8/i16 arguments to i32. - CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, + // Promote i1/i8/i16/v1i1 arguments to i32. + CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>, // The 'nest' parameter, if any, is passed in R10. CCIfNest<CCIfSubtarget<"isTarget64BitILP32()", CCAssignToReg<[R10D]>>>, @@ -584,8 +586,8 @@ def CC_X86_Win64_C : CallingConv<[ // FIXME: Handle byval stuff. // FIXME: Handle varargs. - // Promote i1/i8/i16 arguments to i32. - CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, + // Promote i1/i8/i16/v1i1 arguments to i32. + CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>, // The 'nest' parameter, if any, is passed in R10. 
CCIfNest<CCAssignToReg<[R10]>>, @@ -796,8 +798,8 @@ def CC_X86_32_Common : CallingConv<[ ]>; def CC_X86_32_C : CallingConv<[ - // Promote i1/i8/i16 arguments to i32. - CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, + // Promote i1/i8/i16/v1i1 arguments to i32. + CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>, // The 'nest' parameter, if any, is passed in ECX. CCIfNest<CCAssignToReg<[ECX]>>, @@ -816,8 +818,8 @@ def CC_X86_32_MCU : CallingConv<[ // puts arguments in registers. CCIfByVal<CCPassByVal<4, 4>>, - // Promote i1/i8/i16 arguments to i32. - CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, + // Promote i1/i8/i16/v1i1 arguments to i32. + CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>, // If the call is not a vararg call, some arguments may be passed // in integer registers. @@ -828,8 +830,8 @@ def CC_X86_32_MCU : CallingConv<[ ]>; def CC_X86_32_FastCall : CallingConv<[ - // Promote i1/i8/i16 arguments to i32. - CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, + // Promote i1/i8/i16/v1i1 arguments to i32. + CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>, // The 'nest' parameter, if any, is passed in EAX. CCIfNest<CCAssignToReg<[EAX]>>, @@ -858,15 +860,15 @@ def CC_X86_32_ThisCall_Common : CallingConv<[ ]>; def CC_X86_32_ThisCall_Mingw : CallingConv<[ - // Promote i1/i8/i16 arguments to i32. - CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, + // Promote i1/i8/i16/v1i1 arguments to i32. + CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>, CCDelegateTo<CC_X86_32_ThisCall_Common> ]>; def CC_X86_32_ThisCall_Win : CallingConv<[ - // Promote i1/i8/i16 arguments to i32. - CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, + // Promote i1/i8/i16/v1i1 arguments to i32. + CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>, // Pass sret arguments indirectly through stack. CCIfSRet<CCAssignToStack<4, 4>>, @@ -885,8 +887,8 @@ def CC_X86_32_FastCC : CallingConv<[ // puts arguments in registers. CCIfByVal<CCPassByVal<4, 4>>, - // Promote i1/i8/i16 arguments to i32. - CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, + // Promote i1/i8/i16/v1i1 arguments to i32. + CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>, // The 'nest' parameter, if any, is passed in EAX. CCIfNest<CCAssignToReg<[EAX]>>, diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index fc3b4836c178..3cfb924abd01 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -3647,13 +3647,6 @@ unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) { switch (VT.SimpleTy) { default: llvm_unreachable("Unexpected value type"); case MVT::i1: - if (Subtarget->hasAVX512()) { - // Need to copy to a VK1 register. 
- unsigned ResultReg = createResultReg(&X86::VK1RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::COPY), ResultReg).addReg(SrcReg); - return ResultReg; - } case MVT::i8: return fastEmitInst_extractsubreg(MVT::i8, SrcReg, /*Kill=*/true, X86::sub_8bit); diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp index 2cd4c1a3e7b3..9f649dad8bc0 100644 --- a/lib/Target/X86/X86FixupLEAs.cpp +++ b/lib/Target/X86/X86FixupLEAs.cpp @@ -27,20 +27,26 @@ #include "llvm/Target/TargetInstrInfo.h" using namespace llvm; -#define DEBUG_TYPE "x86-fixup-LEAs" +namespace llvm { +void initializeFixupLEAPassPass(PassRegistry &); +} + +#define FIXUPLEA_DESC "X86 LEA Fixup" +#define FIXUPLEA_NAME "x86-fixup-LEAs" + +#define DEBUG_TYPE FIXUPLEA_NAME STATISTIC(NumLEAs, "Number of LEA instructions created"); namespace { class FixupLEAPass : public MachineFunctionPass { enum RegUsageState { RU_NotUsed, RU_Write, RU_Read }; - static char ID; + /// \brief Loop over all of the instructions in the basic block /// replacing applicable instructions with LEA instructions, /// where appropriate. bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI); - StringRef getPassName() const override { return "X86 LEA Fixup"; } /// \brief Given a machine register, look for the instruction /// which writes it in the current basic block. If found, @@ -62,6 +68,22 @@ class FixupLEAPass : public MachineFunctionPass { void processInstructionForSLM(MachineBasicBlock::iterator &I, MachineFunction::iterator MFI); + + /// \brief Given a LEA instruction which is unprofitable + /// on SNB+ try to replace it with other instructions. + /// According to Intel's Optimization Reference Manual: + /// " For LEA instructions with three source operands and some specific + /// situations, instruction latency has increased to 3 cycles, and must + /// dispatch via port 1: + /// - LEA that has all three source operands: base, index, and offset + /// - LEA that uses base and index registers where the base is EBP, RBP, + /// or R13 + /// - LEA that uses RIP relative addressing mode + /// - LEA that uses 16-bit addressing mode " + /// This function currently handles the first 2 cases only. + MachineInstr *processInstrForSlow3OpLEA(MachineInstr &MI, + MachineFunction::iterator MFI); + /// \brief Look for LEAs that add 1 to reg or subtract 1 from reg /// and convert them to INC or DEC respectively. 
bool fixupIncDec(MachineBasicBlock::iterator &I, @@ -85,7 +107,13 @@ class FixupLEAPass : public MachineFunctionPass { MachineBasicBlock::iterator &MBBI) const; public: - FixupLEAPass() : MachineFunctionPass(ID) {} + static char ID; + + StringRef getPassName() const override { return FIXUPLEA_DESC; } + + FixupLEAPass() : MachineFunctionPass(ID) { + initializeFixupLEAPassPass(*PassRegistry::getPassRegistry()); + } /// \brief Loop over all of the basic blocks, /// replacing instructions by equivalent LEA instructions @@ -104,9 +132,12 @@ private: bool OptIncDec; bool OptLEA; }; -char FixupLEAPass::ID = 0; } +char FixupLEAPass::ID = 0; + +INITIALIZE_PASS(FixupLEAPass, FIXUPLEA_NAME, FIXUPLEA_DESC, false, false) + MachineInstr * FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI, MachineBasicBlock::iterator &MBBI) const { @@ -168,7 +199,7 @@ bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) { MF = &Func; const X86Subtarget &ST = Func.getSubtarget<X86Subtarget>(); OptIncDec = !ST.slowIncDec() || Func.getFunction()->optForMinSize(); - OptLEA = ST.LEAusesAG() || ST.slowLEA(); + OptLEA = ST.LEAusesAG() || ST.slowLEA() || ST.slow3OpsLEA(); if (!OptLEA && !OptIncDec) return false; @@ -242,9 +273,64 @@ FixupLEAPass::searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I, return MachineBasicBlock::iterator(); } -static inline bool isLEA(const int opcode) { - return opcode == X86::LEA16r || opcode == X86::LEA32r || - opcode == X86::LEA64r || opcode == X86::LEA64_32r; +static inline bool isLEA(const int Opcode) { + return Opcode == X86::LEA16r || Opcode == X86::LEA32r || + Opcode == X86::LEA64r || Opcode == X86::LEA64_32r; +} + +static inline bool isInefficientLEAReg(unsigned int Reg) { + return Reg == X86::EBP || Reg == X86::RBP || Reg == X86::R13; +} + +static inline bool isRegOperand(const MachineOperand &Op) { + return Op.isReg() && Op.getReg() != X86::NoRegister; +} +/// hasInefficientLEABaseReg - LEA that uses base and index registers +/// where the base is EBP, RBP, or R13. +static inline bool hasInefficientLEABaseReg(const MachineOperand &Base, + const MachineOperand &Index) { + return Base.isReg() && isInefficientLEAReg(Base.getReg()) && + isRegOperand(Index); +} + +static inline bool hasLEAOffset(const MachineOperand &Offset) { + return (Offset.isImm() && Offset.getImm() != 0) || Offset.isGlobal(); +} + +// LEA instruction that has all three operands: offset, base and index +static inline bool isThreeOperandsLEA(const MachineOperand &Base, + const MachineOperand &Index, + const MachineOperand &Offset) { + return isRegOperand(Base) && isRegOperand(Index) && hasLEAOffset(Offset); +} + +static inline int getADDrrFromLEA(int LEAOpcode) { + switch (LEAOpcode) { + default: + llvm_unreachable("Unexpected LEA instruction"); + case X86::LEA16r: + return X86::ADD16rr; + case X86::LEA32r: + return X86::ADD32rr; + case X86::LEA64_32r: + case X86::LEA64r: + return X86::ADD64rr; + } +} + +static inline int getADDriFromLEA(int LEAOpcode, const MachineOperand &Offset) { + bool IsInt8 = Offset.isImm() && isInt<8>(Offset.getImm()); + switch (LEAOpcode) { + default: + llvm_unreachable("Unexpected LEA instruction"); + case X86::LEA16r: + return IsInt8 ? X86::ADD16ri8 : X86::ADD16ri; + case X86::LEA32r: + case X86::LEA64_32r: + return IsInt8 ? X86::ADD32ri8 : X86::ADD32ri; + case X86::LEA64r: + return IsInt8 ?
X86::ADD64ri8 : X86::ADD64ri32; + } } /// isLEASimpleIncOrDec - Does this LEA have one these forms: @@ -337,8 +423,8 @@ void FixupLEAPass::seekLEAFixup(MachineOperand &p, void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I, MachineFunction::iterator MFI) { MachineInstr &MI = *I; - const int opcode = MI.getOpcode(); - if (!isLEA(opcode)) + const int Opcode = MI.getOpcode(); + if (!isLEA(Opcode)) return; if (MI.getOperand(5).getReg() != 0 || !MI.getOperand(4).isImm() || !TII->isSafeToClobberEFLAGS(*MFI, I)) @@ -350,53 +436,142 @@ void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I, return; if (MI.getOperand(2).getImm() > 1) return; - int addrr_opcode, addri_opcode; - switch (opcode) { - default: - llvm_unreachable("Unexpected LEA instruction"); - case X86::LEA16r: - addrr_opcode = X86::ADD16rr; - addri_opcode = X86::ADD16ri; - break; - case X86::LEA32r: - addrr_opcode = X86::ADD32rr; - addri_opcode = X86::ADD32ri; - break; - case X86::LEA64_32r: - case X86::LEA64r: - addrr_opcode = X86::ADD64rr; - addri_opcode = X86::ADD64ri32; - break; - } DEBUG(dbgs() << "FixLEA: Candidate to replace:"; I->dump();); DEBUG(dbgs() << "FixLEA: Replaced by: ";); MachineInstr *NewMI = nullptr; - const MachineOperand &Dst = MI.getOperand(0); // Make ADD instruction for two registers writing to LEA's destination if (SrcR1 != 0 && SrcR2 != 0) { - const MachineOperand &Src1 = MI.getOperand(SrcR1 == DstR ? 1 : 3); - const MachineOperand &Src2 = MI.getOperand(SrcR1 == DstR ? 3 : 1); - NewMI = BuildMI(*MF, MI.getDebugLoc(), TII->get(addrr_opcode)) - .add(Dst) - .add(Src1) - .add(Src2); - MFI->insert(I, NewMI); + const MCInstrDesc &ADDrr = TII->get(getADDrrFromLEA(Opcode)); + const MachineOperand &Src = MI.getOperand(SrcR1 == DstR ? 3 : 1); + NewMI = + BuildMI(*MFI, I, MI.getDebugLoc(), ADDrr, DstR).addReg(DstR).add(Src); DEBUG(NewMI->dump();); } // Make ADD instruction for immediate if (MI.getOperand(4).getImm() != 0) { + const MCInstrDesc &ADDri = + TII->get(getADDriFromLEA(Opcode, MI.getOperand(4))); const MachineOperand &SrcR = MI.getOperand(SrcR1 == DstR ? 1 : 3); - NewMI = BuildMI(*MF, MI.getDebugLoc(), TII->get(addri_opcode)) - .add(Dst) + NewMI = BuildMI(*MFI, I, MI.getDebugLoc(), ADDri, DstR) .add(SrcR) .addImm(MI.getOperand(4).getImm()); - MFI->insert(I, NewMI); DEBUG(NewMI->dump();); } if (NewMI) { MFI->erase(I); - I = static_cast<MachineBasicBlock::iterator>(NewMI); + I = NewMI; + } +} + +MachineInstr * +FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI, + MachineFunction::iterator MFI) { + + const int LEAOpcode = MI.getOpcode(); + if (!isLEA(LEAOpcode)) + return nullptr; + + const MachineOperand &Dst = MI.getOperand(0); + const MachineOperand &Base = MI.getOperand(1); + const MachineOperand &Scale = MI.getOperand(2); + const MachineOperand &Index = MI.getOperand(3); + const MachineOperand &Offset = MI.getOperand(4); + const MachineOperand &Segment = MI.getOperand(5); + + if (!(isThreeOperandsLEA(Base, Index, Offset) || + hasInefficientLEABaseReg(Base, Index)) || + !TII->isSafeToClobberEFLAGS(*MFI, MI) || + Segment.getReg() != X86::NoRegister) + return nullptr; + + unsigned int DstR = Dst.getReg(); + unsigned int BaseR = Base.getReg(); + unsigned int IndexR = Index.getReg(); + unsigned SSDstR = + (LEAOpcode == X86::LEA64_32r) ? 
getX86SubSuperRegister(DstR, 64) : DstR; + bool IsScale1 = Scale.getImm() == 1; + bool IsInefficientBase = isInefficientLEAReg(BaseR); + bool IsInefficientIndex = isInefficientLEAReg(IndexR); + + // Skip these cases since it takes more than 2 instructions + // to replace the LEA instruction. + if (IsInefficientBase && SSDstR == BaseR && !IsScale1) + return nullptr; + if (LEAOpcode == X86::LEA64_32r && IsInefficientBase && + (IsInefficientIndex || !IsScale1)) + return nullptr; + + const DebugLoc DL = MI.getDebugLoc(); + const MCInstrDesc &ADDrr = TII->get(getADDrrFromLEA(LEAOpcode)); + const MCInstrDesc &ADDri = TII->get(getADDriFromLEA(LEAOpcode, Offset)); + + DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MI.dump();); + DEBUG(dbgs() << "FixLEA: Replaced by: ";); + + // First try to replace LEA with one or two (for the 3-op LEA case) + // add instructions: + // 1.lea (%base,%index,1), %base => add %index,%base + // 2.lea (%base,%index,1), %index => add %base,%index + if (IsScale1 && (DstR == BaseR || DstR == IndexR)) { + const MachineOperand &Src = DstR == BaseR ? Index : Base; + MachineInstr *NewMI = + BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Src); + DEBUG(NewMI->dump();); + // Create ADD instruction for the Offset in case of 3-Ops LEA. + if (hasLEAOffset(Offset)) { + NewMI = BuildMI(*MFI, MI, DL, ADDri, DstR).addReg(DstR).add(Offset); + DEBUG(NewMI->dump();); + } + return NewMI; + } + // If the base is inefficient try switching the index and base operands, + // otherwise just break the 3-Ops LEA inst into 2-Ops LEA + ADD instruction: + // lea offset(%base,%index,scale),%dst => + // lea (%base,%index,scale); add offset,%dst + if (!IsInefficientBase || (!IsInefficientIndex && IsScale1)) { + MachineInstr *NewMI = BuildMI(*MFI, MI, DL, TII->get(LEAOpcode)) + .add(Dst) + .add(IsInefficientBase ? Index : Base) + .add(Scale) + .add(IsInefficientBase ? Base : Index) + .addImm(0) + .add(Segment); + DEBUG(NewMI->dump();); + // Create ADD instruction for the Offset in case of 3-Ops LEA. 
+ if (hasLEAOffset(Offset)) { + NewMI = BuildMI(*MFI, MI, DL, ADDri, DstR).addReg(DstR).add(Offset); + DEBUG(NewMI->dump();); + } + return NewMI; + } + // Handle the rest of the cases with inefficient base register: + assert(SSDstR != BaseR && "SSDstR == BaseR should be handled already!"); + assert(IsInefficientBase && "efficient base should be handled already!"); + + // lea (%base,%index,1), %dst => mov %base,%dst; add %index,%dst + if (IsScale1 && !hasLEAOffset(Offset)) { + TII->copyPhysReg(*MFI, MI, DL, DstR, BaseR, Base.isKill()); + DEBUG(MI.getPrevNode()->dump();); + + MachineInstr *NewMI = + BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Index); + DEBUG(NewMI->dump();); + return NewMI; } + // lea offset(%base,%index,scale), %dst => + // lea offset( ,%index,scale), %dst; add %base,%dst + MachineInstr *NewMI = BuildMI(*MFI, MI, DL, TII->get(LEAOpcode)) + .add(Dst) + .addReg(0) + .add(Scale) + .add(Index) + .add(Offset) + .add(Segment); + DEBUG(NewMI->dump();); + + NewMI = BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Base); + DEBUG(NewMI->dump();); + return NewMI; } bool FixupLEAPass::processBasicBlock(MachineFunction &MF, @@ -410,8 +585,16 @@ bool FixupLEAPass::processBasicBlock(MachineFunction &MF, if (OptLEA) { if (MF.getSubtarget<X86Subtarget>().isSLM()) processInstructionForSLM(I, MFI); - else - processInstruction(I, MFI); + + else { + if (MF.getSubtarget<X86Subtarget>().slow3OpsLEA()) { + if (auto *NewMI = processInstrForSlow3OpLEA(*I, MFI)) { + MFI->erase(I); + I = NewMI; + } + } else + processInstruction(I, MFI); + } } } return false; diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 11c08292518a..37b248416e4a 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -1140,7 +1140,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addRegisterClass(MVT::v8i64, &X86::VR512RegClass); addRegisterClass(MVT::v8f64, &X86::VR512RegClass); - addRegisterClass(MVT::i1, &X86::VK1RegClass); + addRegisterClass(MVT::v1i1, &X86::VK1RegClass); addRegisterClass(MVT::v8i1, &X86::VK8RegClass); addRegisterClass(MVT::v16i1, &X86::VK16RegClass); @@ -1155,16 +1155,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal); setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal); } - setOperationAction(ISD::BR_CC, MVT::i1, Expand); - setOperationAction(ISD::SETCC, MVT::i1, Custom); - setOperationAction(ISD::SETCCE, MVT::i1, Custom); - setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); - setOperationAction(ISD::XOR, MVT::i1, Legal); - setOperationAction(ISD::OR, MVT::i1, Legal); - setOperationAction(ISD::AND, MVT::i1, Legal); - setOperationAction(ISD::SUB, MVT::i1, Custom); - setOperationAction(ISD::ADD, MVT::i1, Custom); - setOperationAction(ISD::MUL, MVT::i1, Custom); for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16, MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32, @@ -1233,7 +1223,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::MSTORE, VT, Custom); } } - setOperationAction(ISD::TRUNCATE, MVT::i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); @@ -1311,7 +1300,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::MUL, MVT::v8i64, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, 
MVT::v16i1, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom); setOperationAction(ISD::SELECT, MVT::v8f64, Custom); setOperationAction(ISD::SELECT, MVT::v8i64, Custom); setOperationAction(ISD::SELECT, MVT::v16f32, Custom); @@ -1699,7 +1690,7 @@ EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext& Context, EVT VT) const { if (!VT.isVector()) - return Subtarget.hasAVX512() ? MVT::i1: MVT::i8; + return MVT::i8; if (VT.isSimple()) { MVT VVT = VT.getSimpleVT(); @@ -2480,6 +2471,9 @@ static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT, SelectionDAG &DAG) { SDValue ValReturned = ValArg; + if (ValVT == MVT::v1i1) + return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned); + if (ValVT == MVT::v64i1) { // In 32 bit machine, this case is handled by getv64i1Argument assert(ValLoc == MVT::i64 && "Expecting only i64 locations"); @@ -2502,7 +2496,6 @@ static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT, ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned); } - return DAG.getBitcast(ValVT, ValReturned); } @@ -2809,8 +2802,11 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, SDValue Val = DAG.getLoad( ValVT, dl, Chain, FIN, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); - return ExtendedInMem ? DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val) - : Val; + return ExtendedInMem + ? (VA.getValVT().isVector() + ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val) + : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val)) + : Val; } // FIXME: Get this from tablegen. @@ -2960,7 +2956,7 @@ SDValue X86TargetLowering::LowerFormalArguments( RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass; else if (RegVT == MVT::x86mmx) RC = &X86::VR64RegClass; - else if (RegVT == MVT::i1) + else if (RegVT == MVT::v1i1) RC = &X86::VK1RegClass; else if (RegVT == MVT::v8i1) RC = &X86::VK8RegClass; @@ -6871,7 +6867,7 @@ static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) { for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) { SDValue In = Op.getOperand(idx); if (!In.isUndef()) - Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx; + Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx; } SDLoc dl(Op); MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8)); @@ -6914,7 +6910,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { if (!isa<ConstantSDNode>(In)) NonConstIdx.push_back(idx); else { - Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx; + Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx; HasConstElts = true; } if (SplatIdx < 0) @@ -13946,7 +13942,6 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const SDValue Idx = Op.getOperand(1); MVT EltVT = Op.getSimpleValueType(); - assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector"); assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) && "Unexpected vector type in ExtractBitFromMaskVector"); @@ -13980,8 +13975,8 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8)); Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec, DAG.getConstant(MaxSift, dl, MVT::i8)); - return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec, - DAG.getIntPtrConstant(0, dl)); + return DAG.getNode(X86ISD::VEXTRACT, dl, Op.getSimpleValueType(), Vec, + 
DAG.getIntPtrConstant(0, dl)); } SDValue @@ -13992,7 +13987,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, MVT VecVT = Vec.getSimpleValueType(); SDValue Idx = Op.getOperand(1); - if (Op.getSimpleValueType() == MVT::i1) + if (VecVT.getVectorElementType() == MVT::i1) return ExtractBitFromMaskVector(Op, DAG); if (!isa<ConstantSDNode>(Idx)) { @@ -14163,10 +14158,13 @@ X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const { return EltInVec; } - // Insertion of one bit into first or last position - // can be done with two SHIFTs + OR. + // Insertion of one bit into first position if (IdxVal == 0 ) { - // EltInVec already at correct index and other bits are 0. + // Clean top bits of vector. + EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec, + DAG.getConstant(NumElems - 1, dl, MVT::i8)); + EltInVec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, EltInVec, + DAG.getConstant(NumElems - 1, dl, MVT::i8)); // Clean the first bit in source vector. Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec, DAG.getConstant(1 , dl, MVT::i8)); @@ -14175,6 +14173,7 @@ X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec); } + // Insertion of one bit into last position if (IdxVal == NumElems -1) { // Move the bit to the last position inside the vector. EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec, @@ -17322,8 +17321,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG); - assert(((!Subtarget.hasAVX512() && VT == MVT::i8) || (VT == MVT::i1)) - && "SetCC type must be 8-bit or 1-bit integer"); + assert(VT == MVT::i8 && "SetCC type must be 8-bit integer"); SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDLoc dl(Op); @@ -17457,7 +17455,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { if (SSECC != 8) { if (Subtarget.hasAVX512()) { - SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::i1, CondOp0, + SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1, DAG.getConstant(SSECC, DL, MVT::i8)); return DAG.getNode(VT.isVector() ? X86ISD::SELECT : X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2); @@ -17505,9 +17503,10 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { } // AVX512 fallback is to lower selects of scalar floats to masked moves. - if (Cond.getValueType() == MVT::i1 && (VT == MVT::f64 || VT == MVT::f32) && - Subtarget.hasAVX512()) - return DAG.getNode(X86ISD::SELECTS, DL, VT, Cond, Op1, Op2); + if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) { + SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond); + return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2); + } if (VT.isVector() && VT.getVectorElementType() == MVT::i1) { SDValue Op1Scalar; @@ -19048,8 +19047,8 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, /// \brief Creates an SDNode for a predicated scalar operation. /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc). -/// The mask is coming as MVT::i8 and it should be truncated -/// to MVT::i1 while lowering masking intrinsics. +/// The mask is coming as MVT::i8 and it should be transformed +/// to MVT::v1i1 while lowering masking intrinsics. /// The main difference between ScalarMaskingNode and VectorMaskingNode is using /// "X86select" instead of "vselect". We just can't create the "vselect" node /// for a scalar instruction. 
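The doc comment above pins down the contract of the scalar masking node; a plain C++ model of those semantics follows (hypothetical helper, not SelectionDAG code: the real lowering builds a v1i1-typed X86select rather than scalar control flow):

#include <cstdint>

// Only bit 0 of the incoming i8 mask matters: it selects between the
// operation's result and the pass-through source operand.
float scalarSelect(uint8_t Mask, float OpResult, float PreservedSrc) {
  return (Mask & 1) ? OpResult : PreservedSrc;
}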
@@ -19064,11 +19063,10 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); - // The mask should be of type MVT::i1 - SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask); + SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask); if (Op.getOpcode() == X86ISD::FSETCCM || - Op.getOpcode() == X86ISD::FSETCCM_RND) + Op.getOpcode() == X86ISD::FSETCCM_RND) return DAG.getNode(ISD::AND, dl, VT, Op, IMask); if (Op.getOpcode() == X86ISD::VFPCLASSS) return DAG.getNode(ISD::OR, dl, VT, Op, IMask); @@ -19507,10 +19505,11 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget SDValue Src1 = Op.getOperand(1); SDValue Imm = Op.getOperand(2); SDValue Mask = Op.getOperand(3); - SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Imm); + SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm); SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG); - return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, FPclassMask); + return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, FPclassMask, + DAG.getIntPtrConstant(0, dl)); } case CMP_MASK: case CMP_MASK_CC: { @@ -19570,18 +19569,18 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget if (IntrData->Opc1 != 0) { SDValue Rnd = Op.getOperand(5); if (!isRoundModeCurDirection(Rnd)) - Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::i1, Src1, Src2, CC, Rnd); + Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd); } //default rounding mode if(!Cmp.getNode()) - Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Src2, CC); + Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC); SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG); - - return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, CmpMask); + return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, CmpMask, + DAG.getIntPtrConstant(0, dl)); } case COMI: { // Comparison intrinsics ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1; @@ -19629,13 +19628,13 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget SDValue FCmp; if (isRoundModeCurDirection(Sae)) - FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::i1, LHS, RHS, - DAG.getConstant(CondVal, dl, MVT::i8)); + FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS, + DAG.getConstant(CondVal, dl, MVT::i8)); else - FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::i1, LHS, RHS, - DAG.getConstant(CondVal, dl, MVT::i8), Sae); - // AnyExt just uses KMOVW %kreg, %r32; ZeroExt emits "and $1, %reg" - return DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, FCmp); + FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS, + DAG.getConstant(CondVal, dl, MVT::i8), Sae); + return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i32, FCmp, + DAG.getIntPtrConstant(0, dl)); } case VSHIFT: return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(), @@ -23385,8 +23384,6 @@ static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 && "Unexpected request for vector widening"); - EVT EltVT = NVT.getVectorElementType(); - SDLoc dl(InOp); if (InOp.getOpcode() == ISD::CONCAT_VECTORS && InOp.getNumOperands() == 2) { @@ -23404,6 +23401,8 @@ static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, for (unsigned i = 0; i < InNumElts; ++i) Ops.push_back(InOp.getOperand(i)); + EVT EltVT = 
InOp.getOperand(0).getValueType(); + SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) : DAG.getUNDEF(EltVT); for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i) @@ -24709,16 +24708,22 @@ static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, // xbegin sinkMBB // // mainMBB: - // eax = -1 + // s0 = -1 + // + // fallBB: + // eax = # XABORT_DEF + // s1 = eax // // sinkMBB: - // v = eax + // v = phi(s0/mainBB, s1/fallBB) MachineBasicBlock *thisMBB = MBB; MachineFunction *MF = MBB->getParent(); MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); + MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); MF->insert(I, mainMBB); + MF->insert(I, fallMBB); MF->insert(I, sinkMBB); // Transfer the remainder of BB and its successor edges to sinkMBB. @@ -24726,25 +24731,40 @@ static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, std::next(MachineBasicBlock::iterator(MI)), MBB->end()); sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); + MachineRegisterInfo &MRI = MF->getRegInfo(); + unsigned DstReg = MI.getOperand(0).getReg(); + const TargetRegisterClass *RC = MRI.getRegClass(DstReg); + unsigned mainDstReg = MRI.createVirtualRegister(RC); + unsigned fallDstReg = MRI.createVirtualRegister(RC); + // thisMBB: - // xbegin sinkMBB + // xbegin fallMBB // # fallthrough to mainMBB - // # abortion to sinkMBB - BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB); + // # abortion to fallMBB + BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB); thisMBB->addSuccessor(mainMBB); - thisMBB->addSuccessor(sinkMBB); + thisMBB->addSuccessor(fallMBB); // mainMBB: - // EAX = -1 - BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1); + // mainDstReg := -1 + BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1); + BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB); mainMBB->addSuccessor(sinkMBB); - // sinkMBB: - // EAX is live into the sinkMBB - sinkMBB->addLiveIn(X86::EAX); - BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(TargetOpcode::COPY), - MI.getOperand(0).getReg()) + // fallMBB: + // ; pseudo instruction to model hardware's definition from XABORT + // EAX := XABORT_DEF + // fallDstReg := EAX + BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF)); + BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg) .addReg(X86::EAX); + fallMBB->addSuccessor(sinkMBB); + + // sinkMBB: + // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB) + BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg) + .addReg(mainDstReg).addMBB(mainMBB) + .addReg(fallDstReg).addMBB(fallMBB); MI.eraseFromParent(); return sinkMBB; @@ -29574,7 +29594,7 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1) { // Invert the cond to not(cond) : xor(op,allones)=not(op) - SDValue CondNew = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, + SDValue CondNew = DAG.getNode(ISD::XOR, DL, CondVT, Cond, DAG.getAllOnesConstant(DL, CondVT)); // Vselect cond, op1, op2 = Vselect not(cond), op2, op1 return DAG.getSelect(DL, VT, CondNew, RHS, LHS); @@ -31321,13 +31341,11 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, // See X86ATTInstPrinter.cpp:printSSECC(). unsigned x86cc = (cc0 == X86::COND_E) ? 
0 : 4; if (Subtarget.hasAVX512()) { - SDValue FSetCC = DAG.getNode(X86ISD::FSETCCM, DL, MVT::i1, CMP00, - CMP01, - DAG.getConstant(x86cc, DL, MVT::i8)); - if (N->getValueType(0) != MVT::i1) - return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0), - FSetCC); - return FSetCC; + SDValue FSetCC = + DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01, + DAG.getConstant(x86cc, DL, MVT::i8)); + return DAG.getNode(X86ISD::VEXTRACT, DL, N->getSimpleValueType(0), + FSetCC, DAG.getIntPtrConstant(0, DL)); } SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00, CMP01, diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 71d395244b4a..f9344413bbcf 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -31,8 +31,7 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc, RegisterClass KRCWM = !cast<RegisterClass>("VK" # NumElts # "WM"); // The mask VT. - ValueType KVT = !cast<ValueType>(!if (!eq (NumElts, 1), "i1", - "v" # NumElts # "i1")); + ValueType KVT = !cast<ValueType>("v" # NumElts # "i1"); // Suffix used in the instruction mnemonic. string Suffix = suffix; @@ -2263,7 +2262,7 @@ let Predicates = [HasAVX512, NoDQI] in { let Predicates = [HasAVX512] in { def : Pat<(store (i16 (bitconvert (v16i1 VK16:$src))), addr:$dst), (KMOVWmk addr:$dst, VK16:$src)>; - def : Pat<(i1 (load addr:$src)), + def : Pat<(v1i1 (load addr:$src)), (COPY_TO_REGCLASS (AND32ri8 (MOVZX32rm8 addr:$src), (i32 1)), VK1)>; def : Pat<(v16i1 (bitconvert (i16 (load addr:$src)))), (KMOVWkm addr:$src)>; @@ -2280,77 +2279,45 @@ let Predicates = [HasBWI] in { } let Predicates = [HasAVX512] in { - def : Pat<(i1 (trunc (i64 GR64:$src))), - (COPY_TO_REGCLASS (AND32ri8 (EXTRACT_SUBREG $src, sub_32bit), - (i32 1)), VK1)>; + multiclass operation_gpr_mask_copy_lowering<RegisterClass maskRC, ValueType maskVT> { + def : Pat<(maskVT (scalar_to_vector GR32:$src)), + (COPY_TO_REGCLASS GR32:$src, maskRC)>; - def : Pat<(i1 (trunc (i32 GR32:$src))), - (COPY_TO_REGCLASS (AND32ri8 $src, (i32 1)), VK1)>; + def : Pat<(i32 (X86Vextract maskRC:$src, (iPTR 0))), + (COPY_TO_REGCLASS maskRC:$src, GR32)>; - def : Pat<(i1 (trunc (i32 (assertzext_i1 GR32:$src)))), - (COPY_TO_REGCLASS GR32:$src, VK1)>; + def : Pat<(maskVT (scalar_to_vector GR8:$src)), + (COPY_TO_REGCLASS (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src, sub_8bit), maskRC)>; - def : Pat<(i1 (trunc (i8 GR8:$src))), - (COPY_TO_REGCLASS - (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), - GR8:$src, sub_8bit), (i32 1)), VK1)>; + def : Pat<(i8 (X86Vextract maskRC:$src, (iPTR 0))), + (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS maskRC:$src, GR32)), sub_8bit)>; - def : Pat<(i1 (trunc (i16 GR16:$src))), - (COPY_TO_REGCLASS - (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), - GR16:$src, sub_16bit), (i32 1)), VK1)>; - - def : Pat<(i32 (zext VK1:$src)), - (AND32ri8 (COPY_TO_REGCLASS VK1:$src, GR32), (i32 1))>; - - def : Pat<(i32 (anyext VK1:$src)), - (COPY_TO_REGCLASS VK1:$src, GR32)>; - - def : Pat<(i8 (zext VK1:$src)), - (EXTRACT_SUBREG - (AND32ri8 (COPY_TO_REGCLASS VK1:$src, GR32), (i32 1)), sub_8bit)>; - - def : Pat<(i8 (anyext VK1:$src)), - (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK1:$src, GR32)), sub_8bit)>; + def : Pat<(i32 (anyext (i8 (X86Vextract maskRC:$src, (iPTR 0))))), + (COPY_TO_REGCLASS maskRC:$src, GR32)>; + } - def : Pat<(i64 (zext VK1:$src)), - (SUBREG_TO_REG (i64 0), - (AND32ri8 (COPY_TO_REGCLASS VK1:$src, GR32), (i32 1)), sub_32bit)>; + defm : operation_gpr_mask_copy_lowering<VK1, 
v1i1>; + defm : operation_gpr_mask_copy_lowering<VK2, v2i1>; + defm : operation_gpr_mask_copy_lowering<VK4, v4i1>; + defm : operation_gpr_mask_copy_lowering<VK8, v8i1>; + defm : operation_gpr_mask_copy_lowering<VK16, v16i1>; + defm : operation_gpr_mask_copy_lowering<VK32, v32i1>; + defm : operation_gpr_mask_copy_lowering<VK64, v64i1>; - def : Pat<(i64 (anyext VK1:$src)), - (INSERT_SUBREG (i64 (IMPLICIT_DEF)), - (i32 (COPY_TO_REGCLASS VK1:$src, GR32)), sub_32bit)>; + def : Pat<(X86kshiftr (X86kshiftl (v1i1 (scalar_to_vector GR8:$src)), (i8 15)), (i8 15)) , + (COPY_TO_REGCLASS + (KMOVWkr (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), + GR8:$src, sub_8bit), (i32 1))), VK1)>; + def : Pat<(X86kshiftr (X86kshiftl (v16i1 (scalar_to_vector GR8:$src)), (i8 15)), (i8 15)) , + (COPY_TO_REGCLASS + (KMOVWkr (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), + GR8:$src, sub_8bit), (i32 1))), VK16)>; + def : Pat<(X86kshiftr (X86kshiftl (v8i1 (scalar_to_vector GR8:$src)), (i8 15)), (i8 15)) , + (COPY_TO_REGCLASS + (KMOVWkr (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), + GR8:$src, sub_8bit), (i32 1))), VK8)>; - def : Pat<(i16 (zext VK1:$src)), - (EXTRACT_SUBREG - (AND32ri8 (COPY_TO_REGCLASS VK1:$src, GR32), (i32 1)), sub_16bit)>; - - def : Pat<(i16 (anyext VK1:$src)), - (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK1:$src, GR32)), sub_16bit)>; -} -def : Pat<(v16i1 (scalar_to_vector VK1:$src)), - (COPY_TO_REGCLASS VK1:$src, VK16)>; -def : Pat<(v8i1 (scalar_to_vector VK1:$src)), - (COPY_TO_REGCLASS VK1:$src, VK8)>; -def : Pat<(v4i1 (scalar_to_vector VK1:$src)), - (COPY_TO_REGCLASS VK1:$src, VK4)>; -def : Pat<(v2i1 (scalar_to_vector VK1:$src)), - (COPY_TO_REGCLASS VK1:$src, VK2)>; -def : Pat<(v32i1 (scalar_to_vector VK1:$src)), - (COPY_TO_REGCLASS VK1:$src, VK32)>; -def : Pat<(v64i1 (scalar_to_vector VK1:$src)), - (COPY_TO_REGCLASS VK1:$src, VK64)>; - -def : Pat<(store (i1 -1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>; -def : Pat<(store (i1 1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>; -def : Pat<(store (i1 0), addr:$dst), (MOV8mi addr:$dst, (i8 0))>; - -def : Pat<(i1 (X86Vextract VK64:$src, (iPTR 0))), (COPY_TO_REGCLASS VK64:$src, VK1)>; -def : Pat<(i1 (X86Vextract VK32:$src, (iPTR 0))), (COPY_TO_REGCLASS VK32:$src, VK1)>; -def : Pat<(i1 (X86Vextract VK16:$src, (iPTR 0))), (COPY_TO_REGCLASS VK16:$src, VK1)>; -def : Pat<(i1 (X86Vextract VK8:$src, (iPTR 0))), (COPY_TO_REGCLASS VK8:$src, VK1)>; -def : Pat<(i1 (X86Vextract VK4:$src, (iPTR 0))), (COPY_TO_REGCLASS VK4:$src, VK1)>; -def : Pat<(i1 (X86Vextract VK2:$src, (iPTR 0))), (COPY_TO_REGCLASS VK2:$src, VK1)>; +} // Mask unary operation // - KNOT @@ -2551,14 +2518,11 @@ let Predicates = [HasAVX512] in { def : Pat<(v8i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK8)>; def : Pat<(v4i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK4)>; def : Pat<(v2i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK2)>; + def : Pat<(v1i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK1)>; def : Pat<(v8i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK8)>; def : Pat<(v4i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK4)>; def : Pat<(v2i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK2)>; - let AddedComplexity = 10 in { // To optimize isel table. 
- def : Pat<(i1 0), (COPY_TO_REGCLASS (KSET0W), VK1)>; - def : Pat<(i1 1), (COPY_TO_REGCLASS (KSHIFTRWri (KSET1W), (i8 15)), VK1)>; - def : Pat<(i1 -1), (COPY_TO_REGCLASS (KSHIFTRWri (KSET1W), (i8 15)), VK1)>; - } + def : Pat<(v1i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK1)>; } // Patterns for kmask insert_subvector/extract_subvector to/from index=0 @@ -2570,6 +2534,12 @@ multiclass operation_subvector_mask_lowering<RegisterClass subRC, ValueType subV def : Pat<(VT (insert_subvector undef, subRC:$src, (iPTR 0))), (VT (COPY_TO_REGCLASS subRC:$src, RC))>; } +defm : operation_subvector_mask_lowering<VK1, v1i1, VK2, v2i1>; +defm : operation_subvector_mask_lowering<VK1, v1i1, VK4, v4i1>; +defm : operation_subvector_mask_lowering<VK1, v1i1, VK8, v8i1>; +defm : operation_subvector_mask_lowering<VK1, v1i1, VK16, v16i1>; +defm : operation_subvector_mask_lowering<VK1, v1i1, VK32, v32i1>; +defm : operation_subvector_mask_lowering<VK1, v1i1, VK64, v64i1>; defm : operation_subvector_mask_lowering<VK2, v2i1, VK4, v4i1>; defm : operation_subvector_mask_lowering<VK2, v2i1, VK8, v8i1>; @@ -3249,7 +3219,7 @@ multiclass avx512_move_scalar_lowering<string InstrStr, SDNode OpNode, def : Pat<(_.VT (OpNode _.RC:$src0, (_.VT (scalar_to_vector - (_.EltVT (X86selects (i1 (trunc GR32:$mask)), + (_.EltVT (X86selects (scalar_to_vector (and (i8 (trunc GR32:$mask)), (i8 1))), (_.EltVT _.FRC:$src1), (_.EltVT _.FRC:$src2))))))), (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr#rrk) @@ -3260,7 +3230,7 @@ def : Pat<(_.VT (OpNode _.RC:$src0, def : Pat<(_.VT (OpNode _.RC:$src0, (_.VT (scalar_to_vector - (_.EltVT (X86selects (i1 (trunc GR32:$mask)), + (_.EltVT (X86selects (scalar_to_vector (and (i8 (trunc GR32:$mask)), (i8 1))), (_.EltVT _.FRC:$src1), (_.EltVT ZeroFP))))))), (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr#rrkz) @@ -3279,7 +3249,7 @@ def : Pat<(masked_store addr:$dst, Mask, (iPTR 0))), (iPTR 0)))), (!cast<Instruction>(InstrStr#mrk) addr:$dst, - (i1 (COPY_TO_REGCLASS MaskRC:$mask, VK1WM)), + (COPY_TO_REGCLASS MaskRC:$mask, VK1WM), (COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>; } @@ -3296,7 +3266,7 @@ def : Pat<(masked_store addr:$dst, Mask, (iPTR 0))), (iPTR 0)))), (!cast<Instruction>(InstrStr#mrk) addr:$dst, - (i1 (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM)), + (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM), (COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>; } @@ -3310,7 +3280,7 @@ def : Pat<(_.info128.VT (extract_subvector (v16i32 immAllZerosV))))), (iPTR 0))), (!cast<Instruction>(InstrStr#rmkz) - (i1 (COPY_TO_REGCLASS MaskRC:$mask, VK1WM)), + (COPY_TO_REGCLASS MaskRC:$mask, VK1WM), addr:$srcAddr)>; def : Pat<(_.info128.VT (extract_subvector @@ -3322,7 +3292,7 @@ def : Pat<(_.info128.VT (extract_subvector (iPTR 0))))), (iPTR 0))), (!cast<Instruction>(InstrStr#rmk) _.info128.RC:$src, - (i1 (COPY_TO_REGCLASS MaskRC:$mask, VK1WM)), + (COPY_TO_REGCLASS MaskRC:$mask, VK1WM), addr:$srcAddr)>; } @@ -3338,7 +3308,7 @@ def : Pat<(_.info128.VT (extract_subvector (v16i32 immAllZerosV))))), (iPTR 0))), (!cast<Instruction>(InstrStr#rmkz) - (i1 (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM)), + (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM), addr:$srcAddr)>; def : Pat<(_.info128.VT (extract_subvector @@ -3350,7 +3320,7 @@ def : Pat<(_.info128.VT (extract_subvector (iPTR 0))))), (iPTR 0))), (!cast<Instruction>(InstrStr#rmk) _.info128.RC:$src, - (i1 
(COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM)), + (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM), addr:$srcAddr)>; } @@ -3381,7 +3351,7 @@ def : Pat<(f64 (X86selects VK1WM:$mask, (f64 FR64X:$src1), (f64 FR64X:$src2))), VK1WM:$mask, (v2f64 (IMPLICIT_DEF)), FR64X:$src1), FR64X)>; def : Pat<(int_x86_avx512_mask_store_ss addr:$dst, VR128X:$src, GR8:$mask), - (VMOVSSZmrk addr:$dst, (i1 (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR8:$mask, sub_8bit)), VK1WM)), + (VMOVSSZmrk addr:$dst, (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR8:$mask, sub_8bit)), VK1WM), (COPY_TO_REGCLASS VR128X:$src, FR32X))>; let hasSideEffects = 0 in diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 9867ba84bb9b..e2e228f5544b 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -274,7 +274,7 @@ def X86select : SDNode<"X86ISD::SELECT", SDTCisSameNumEltsAs<0, 1>]>>; def X86selects : SDNode<"X86ISD::SELECTS", - SDTypeProfile<1, 3, [SDTCisVT<1, i1>, + SDTypeProfile<1, 3, [SDTCisVT<1, v1i1>, SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3>]>>; @@ -441,7 +441,7 @@ def X86Vfpclass : SDNode<"X86ISD::VFPCLASS", SDTCisSameNumEltsAs<0,1>, SDTCisVT<2, i32>]>, []>; def X86Vfpclasss : SDNode<"X86ISD::VFPCLASSS", - SDTypeProfile<1, 2, [SDTCisVT<0, i1>, + SDTypeProfile<1, 2, [SDTCisVT<0, v1i1>, SDTCisFP<1>, SDTCisVT<2, i32>]>,[]>; def X86SubVBroadcast : SDNode<"X86ISD::SUBV_BROADCAST", @@ -451,7 +451,7 @@ def X86SubVBroadcast : SDNode<"X86ISD::SUBV_BROADCAST", def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>; def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>; def X86Vextract : SDNode<"X86ISD::VEXTRACT", SDTypeProfile<1, 2, - [SDTCisEltOfVec<0, 1>, SDTCisVec<1>, + [SDTCisVec<1>, SDTCisPtrTy<2>]>, []>; def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>; diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 092ceb207ada..f7083a7448ce 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -10511,9 +10511,7 @@ void X86InstrInfo::insertOutlinerEpilogue(MachineBasicBlock &MBB, void X86InstrInfo::insertOutlinerPrologue(MachineBasicBlock &MBB, MachineFunction &MF, - bool IsTailCall) const { - return; -} + bool IsTailCall) const {} MachineBasicBlock::iterator X86InstrInfo::insertOutlinedCall(Module &M, MachineBasicBlock &MBB, diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 4d7d8ece92d9..01df07e1715f 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -896,9 +896,16 @@ def KernelCode : Predicate<"TM.getCodeModel() == CodeModel::Kernel">; def NearData : Predicate<"TM.getCodeModel() == CodeModel::Small ||" "TM.getCodeModel() == CodeModel::Kernel">; def IsNotPIC : Predicate<"!TM.isPositionIndependent()">; -def OptForSize : Predicate<"Subtarget->getOptForSize()">; -def OptForMinSize : Predicate<"Subtarget->getOptForMinSize()">; -def OptForSpeed : Predicate<"!Subtarget->getOptForSize()">; + +// We could compute these on a per-module basis but doing so requires accessing +// the Function object through the <Target>Subtarget and objections were raised +// to that (see post-commit review comments for r301750). 
diff --git a/lib/Target/X86/X86InstrTSX.td b/lib/Target/X86/X86InstrTSX.td
index 38ac8be94483..61aac58a491f 100644
--- a/lib/Target/X86/X86InstrTSX.td
+++ b/lib/Target/X86/X86InstrTSX.td
@@ -30,6 +30,11 @@ def XBEGIN_4 : Ii32PCRel<0xc7, MRM_F8, (outs), (ins brtarget32:$dst),
                          "xbegin\t$dst", []>, OpSize32;
 }
 
+// Pseudo instruction to fake the definition of EAX on the fallback code path.
+let isPseudo = 1, Defs = [EAX] in {
+def XABORT_DEF : I<0, Pseudo, (outs), (ins), "# XABORT DEF", []>;
+}
+
 def XEND : I<0x01, MRM_D5, (outs), (ins),
              "xend", [(int_x86_xend)]>, TB, Requires<[HasRTM]>;
 
diff --git a/lib/Target/X86/X86InstructionSelector.cpp b/lib/Target/X86/X86InstructionSelector.cpp
index 5eb5ad52840a..61956f741820 100644
--- a/lib/Target/X86/X86InstructionSelector.cpp
+++ b/lib/Target/X86/X86InstructionSelector.cpp
@@ -449,24 +449,30 @@ bool X86InstructionSelector::selectTrunc(MachineInstr &I,
   if (!SrcRC)
     return false;
 
-  if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
-      !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
-    DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
-    return false;
-  }
-
+  unsigned SubIdx;
   if (DstRC == SrcRC) {
     // Nothing to be done
+    SubIdx = X86::NoSubRegister;
   } else if (DstRC == &X86::GR32RegClass) {
-    I.getOperand(1).setSubReg(X86::sub_32bit);
+    SubIdx = X86::sub_32bit;
   } else if (DstRC == &X86::GR16RegClass) {
-    I.getOperand(1).setSubReg(X86::sub_16bit);
+    SubIdx = X86::sub_16bit;
   } else if (DstRC == &X86::GR8RegClass) {
-    I.getOperand(1).setSubReg(X86::sub_8bit);
+    SubIdx = X86::sub_8bit;
   } else {
     return false;
   }
 
+  SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubIdx);
+
+  if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
+      !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
+    DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
+    return false;
+  }
+
+  I.getOperand(1).setSubReg(SubIdx);
+
   I.setDesc(TII.get(X86::COPY));
   return true;
 }
diff --git a/lib/Target/X86/X86LegalizerInfo.cpp b/lib/Target/X86/X86LegalizerInfo.cpp
index 8ce240714f17..da724f5d8989 100644
--- a/lib/Target/X86/X86LegalizerInfo.cpp
+++ b/lib/Target/X86/X86LegalizerInfo.cpp
@@ -184,6 +184,7 @@ void X86LegalizerInfo::setLegalizerInfoSSE2() {
     return;
 
   const LLT s64 = LLT::scalar(64);
+  const LLT v16s8 = LLT::vector(16, 8);
   const LLT v8s16 = LLT::vector(8, 16);
   const LLT v4s32 = LLT::vector(4, 32);
   const LLT v2s64 = LLT::vector(2, 64);
@@ -193,7 +194,7 @@ void X86LegalizerInfo::setLegalizerInfoSSE2() {
     setAction({BinOp, Ty}, Legal);
 
   for (unsigned BinOp : {G_ADD, G_SUB})
-    for (auto Ty : {v4s32})
+    for (auto Ty : {v16s8, v8s16, v4s32, v2s64})
       setAction({BinOp, Ty}, Legal);
 
   setAction({G_MUL, v8s16}, Legal);
@@ -212,8 +213,14 @@ void X86LegalizerInfo::setLegalizerInfoAVX2() {
   if (!Subtarget.hasAVX2())
     return;
 
+  const LLT v32s8 = LLT::vector(32, 8);
   const LLT v16s16 = LLT::vector(16, 16);
   const LLT v8s32 = LLT::vector(8, 32);
+  const LLT v4s64 = LLT::vector(4, 64);
+
+  for (unsigned BinOp : {G_ADD, G_SUB})
+    for (auto Ty : {v32s8, v16s16, v8s32, v4s64})
+      setAction({BinOp, Ty}, Legal);
 
   for (auto Ty : {v16s16, v8s32})
     setAction({G_MUL, Ty}, Legal);
@@ -224,6 +231,11 @@ void X86LegalizerInfo::setLegalizerInfoAVX512() {
     return;
 
   const LLT v16s32 = LLT::vector(16, 32);
+  const LLT v8s64 = LLT::vector(8, 64);
+
+  for (unsigned BinOp : {G_ADD, G_SUB})
+    for (auto Ty : {v16s32, v8s64})
+      setAction({BinOp, Ty}, Legal);
 
   setAction({G_MUL, v16s32}, Legal);
 
@@ -261,8 +273,13 @@ void X86LegalizerInfo::setLegalizerInfoAVX512BW() {
   if (!(Subtarget.hasAVX512() && Subtarget.hasBWI()))
     return;
 
+  const LLT v64s8 = LLT::vector(64, 8);
   const LLT v32s16 = LLT::vector(32, 16);
 
+  for (unsigned BinOp : {G_ADD, G_SUB})
+    for (auto Ty : {v64s8, v32s16})
+      setAction({BinOp, Ty}, Legal);
+
   setAction({G_MUL, v32s16}, Legal);
 
   /************ VLX *******************/
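The legalizer hunks above whitelist whole families of (opcode, vector type) pairs per feature level. A standalone model of that table-driven scheme; Op, LLT, and setAction here are simplified stand-ins for the GlobalISel originals, with everything unlisted defaulting to illegal:

#include <cstdio>
#include <map>
#include <utility>

enum Op { G_ADD, G_SUB, G_MUL };

struct LLT {
  unsigned NumElts, EltBits; // e.g. v16s8 -> {16, 8}
  bool operator<(const LLT &O) const {
    return std::make_pair(NumElts, EltBits) <
           std::make_pair(O.NumElts, O.EltBits);
  }
};

enum Action { Legal, Illegal };

static std::map<std::pair<int, LLT>, Action> Actions;

static void setAction(int Opc, LLT Ty, Action A) { Actions[{Opc, Ty}] = A; }

static Action getAction(int Opc, LLT Ty) {
  auto It = Actions.find({Opc, Ty});
  return It == Actions.end() ? Illegal : It->second; // default: not legal
}

int main() {
  const LLT v16s8{16, 8}, v8s16{8, 16}, v4s32{4, 32}, v2s64{2, 64};
  // Mirrors the SSE2 hunk: integer add/sub become legal for all four
  // 128-bit vector shapes at once.
  for (int BinOp : {G_ADD, G_SUB})
    for (LLT Ty : {v16s8, v8s16, v4s32, v2s64})
      setAction(BinOp, Ty, Legal);

  std::printf("G_SUB v8s16 -> %s\n",
              getAction(G_SUB, v8s16) == Legal ? "legal" : "illegal");
}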
diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td
index d235d2b40b15..3a61a7247c72 100644
--- a/lib/Target/X86/X86RegisterInfo.td
+++ b/lib/Target/X86/X86RegisterInfo.td
@@ -511,7 +511,7 @@ def VR256X : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64],
                            256, (sequence "YMM%u", 0, 31)>;
 
 // Mask registers
-def VK1  : RegisterClass<"X86", [i1],    16, (sequence "K%u", 0, 7)> {let Size = 16;}
+def VK1  : RegisterClass<"X86", [v1i1],  16, (sequence "K%u", 0, 7)> {let Size = 16;}
 def VK2  : RegisterClass<"X86", [v2i1],  16, (add VK1)> {let Size = 16;}
 def VK4  : RegisterClass<"X86", [v4i1],  16, (add VK2)> {let Size = 16;}
 def VK8  : RegisterClass<"X86", [v8i1],  16, (add VK4)> {let Size = 16;}
@@ -519,7 +519,7 @@ def VK16 : RegisterClass<"X86", [v16i1], 16, (add VK8)> {let Size = 16;}
 def VK32 : RegisterClass<"X86", [v32i1], 32, (add VK16)> {let Size = 32;}
 def VK64 : RegisterClass<"X86", [v64i1], 64, (add VK32)> {let Size = 64;}
 
-def VK1WM : RegisterClass<"X86", [i1],    16, (sub VK1, K0)> {let Size = 16;}
+def VK1WM : RegisterClass<"X86", [v1i1],  16, (sub VK1, K0)> {let Size = 16;}
 def VK2WM : RegisterClass<"X86", [v2i1],  16, (sub VK2, K0)> {let Size = 16;}
 def VK4WM : RegisterClass<"X86", [v4i1],  16, (sub VK4, K0)> {let Size = 16;}
 def VK8WM : RegisterClass<"X86", [v8i1],  16, (sub VK8, K0)> {let Size = 16;}
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index d66d39dcee17..2b1f43bffd71 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -320,6 +320,7 @@ void X86Subtarget::initializeEnvironment() {
   CallRegIndirect = false;
   LEAUsesAG = false;
   SlowLEA = false;
+  Slow3OpsLEA = false;
   SlowIncDec = false;
   stackAlignment = 4;
   // FIXME: this is a known good value for Yonah. How about others?
@@ -336,8 +337,7 @@ X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU,
 
 X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
                            const X86TargetMachine &TM,
-                           unsigned StackAlignOverride, bool OptForSize,
-                           bool OptForMinSize)
+                           unsigned StackAlignOverride)
     : X86GenSubtargetInfo(TT, CPU, FS), X86ProcFamily(Others),
       PICStyle(PICStyles::None), TM(TM), TargetTriple(TT),
       StackAlignOverride(StackAlignOverride),
@@ -347,8 +347,7 @@ X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
       In16BitMode(TargetTriple.getArch() == Triple::x86 &&
                   TargetTriple.getEnvironment() == Triple::CODE16),
       InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this),
-      FrameLowering(*this, getStackAlignment()), OptForSize(OptForSize),
-      OptForMinSize(OptForMinSize) {
+      FrameLowering(*this, getStackAlignment()) {
   // Determine the PICStyle based on the target selected.
   if (!isPositionIndependent())
     setPICStyle(PICStyles::None);
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index de1514243aeb..a9f3a2aee1be 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -253,6 +253,11 @@ protected:
   /// True if the LEA instruction with certain arguments is slow
   bool SlowLEA;
 
+  /// True if the LEA instruction has all three source operands: base, index,
+  /// and offset, or if the LEA instruction uses base and index registers where
+  /// the base is EBP, RBP, or R13
+  bool Slow3OpsLEA;
+
   /// True if INC and DEC instructions are slow when writing to flags
   bool SlowIncDec;
 
@@ -331,16 +336,12 @@ private:
   X86TargetLowering TLInfo;
   X86FrameLowering FrameLowering;
 
-  bool OptForSize;
-  bool OptForMinSize;
-
 public:
   /// This constructor initializes the data members to match that
   /// of the specified triple.
   ///
   X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
-               const X86TargetMachine &TM, unsigned StackAlignOverride,
-               bool OptForSize, bool OptForMinSize);
+               const X86TargetMachine &TM, unsigned StackAlignOverride);
 
   /// This object will take ownership of \p GISelAccessor.
   void setGISelAccessor(GISelAccessor &GISel) { this->GISel.reset(&GISel); }
@@ -490,6 +491,7 @@ public:
   bool callRegIndirect() const { return CallRegIndirect; }
   bool LEAusesAG() const { return LEAUsesAG; }
   bool slowLEA() const { return SlowLEA; }
+  bool slow3OpsLEA() const { return Slow3OpsLEA; }
   bool slowIncDec() const { return SlowIncDec; }
   bool hasCDI() const { return HasCDI; }
   bool hasPFI() const { return HasPFI; }
@@ -507,9 +509,6 @@ public:
   bool isSLM() const { return X86ProcFamily == IntelSLM; }
   bool useSoftFloat() const { return UseSoftFloat; }
 
-  bool getOptForSize() const { return OptForSize; }
-  bool getOptForMinSize() const { return OptForMinSize; }
-
   /// Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
   /// no-sse2). There isn't any reason to disable it if the target processor
   /// supports it.
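The new Slow3OpsLEA flag encodes exactly the rule its doc comment spells out. A self-contained sketch of that classification; this is a toy model over plain structs, not the real check, which would inspect MachineInstr memory operands (presumably in the FixupLEA pass that the next file registers):

#include <cstdio>

// Toy model of the Slow3OpsLEA rule: an LEA is "slow" if it uses all three
// of base, index, and displacement, or base+index with EBP/RBP/R13 as base.
enum Reg { NoReg, EAX, EBP, RBP, R13, RSI };

struct LEAAddr {
  Reg Base = NoReg;
  Reg Index = NoReg;
  int Disp = 0;
};

bool isSlow3OpsLEA(const LEAAddr &A) {
  bool HasBase = A.Base != NoReg;
  bool HasIndex = A.Index != NoReg;
  bool HasDisp = A.Disp != 0;
  if (HasBase && HasIndex && HasDisp)
    return true; // all three source operands present
  bool BadBase = A.Base == EBP || A.Base == RBP || A.Base == R13;
  return HasBase && HasIndex && BadBase;
}

int main() {
  std::printf("%d %d %d\n",
              isSlow3OpsLEA({EAX, RSI, 8}),  // 1: base+index+disp
              isSlow3OpsLEA({RBP, RSI, 0}),  // 1: base is RBP
              isSlow3OpsLEA({EAX, RSI, 0})); // 0: fast two-operand form
}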
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 9a82e6e50463..53a8e83b36fc 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -61,6 +61,7 @@ static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner",
 namespace llvm {
 
 void initializeWinEHStatePassPass(PassRegistry &);
+void initializeFixupLEAPassPass(PassRegistry &);
 void initializeX86ExecutionDepsFixPass(PassRegistry &);
 
 } // end namespace llvm
@@ -75,6 +76,7 @@ extern "C" void LLVMInitializeX86Target() {
   initializeWinEHStatePassPass(PR);
   initializeFixupBWInstPassPass(PR);
   initializeEvexToVexInstPassPass(PR);
+  initializeFixupLEAPassPass(PR);
   initializeX86ExecutionDepsFixPass(PR);
 }
 
@@ -268,12 +270,6 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const {
     FS = Key.substr(CPU.size());
 
-  bool OptForSize = F.optForSize();
-  bool OptForMinSize = F.optForMinSize();
-
-  Key += std::string(OptForSize ? "+" : "-") + "optforsize";
-  Key += std::string(OptForMinSize ? "+" : "-") + "optforminsize";
-
   auto &I = SubtargetMap[Key];
   if (!I) {
     // This needs to be done before we create a new subtarget since any
@@ -281,8 +277,7 @@
     // function that reside in TargetOptions.
     resetTargetOptions(F);
     I = llvm::make_unique<X86Subtarget>(TargetTriple, CPU, FS, *this,
-                                        Options.StackAlignmentOverride,
-                                        OptForSize, OptForMinSize);
+                                        Options.StackAlignmentOverride);
 #ifndef LLVM_BUILD_GLOBAL_ISEL
     GISelAccessor *GISel = new GISelAccessor();
 #else
@@ -378,12 +373,12 @@ TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) {
 }
 
 void X86PassConfig::addIRPasses() {
-  addPass(createAtomicExpandPass(&getX86TargetMachine()));
+  addPass(createAtomicExpandPass());
 
   TargetPassConfig::addIRPasses();
 
   if (TM->getOptLevel() != CodeGenOpt::None)
-    addPass(createInterleavedAccessPass(TM));
+    addPass(createInterleavedAccessPass());
 }
 
 bool X86PassConfig::addInstSelector() {
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index 8566bd91c89e..fe94079fd869 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1392,15 +1392,47 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
   // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
   // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
   // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
+  static const CostTblEntry AVX512CDCostTbl[] = {
+    { ISD::CTLZ,       MVT::v8i64,   1 },
+    { ISD::CTLZ,       MVT::v16i32,  1 },
+    { ISD::CTLZ,       MVT::v32i16,  8 },
+    { ISD::CTLZ,       MVT::v64i8,  20 },
+    { ISD::CTLZ,       MVT::v4i64,   1 },
+    { ISD::CTLZ,       MVT::v8i32,   1 },
+    { ISD::CTLZ,       MVT::v16i16,  4 },
+    { ISD::CTLZ,       MVT::v32i8,  10 },
+    { ISD::CTLZ,       MVT::v2i64,   1 },
+    { ISD::CTLZ,       MVT::v4i32,   1 },
+    { ISD::CTLZ,       MVT::v8i16,   4 },
+    { ISD::CTLZ,       MVT::v16i8,   4 },
+  };
   static const CostTblEntry AVX512BWCostTbl[] = {
     { ISD::BITREVERSE, MVT::v8i64,   5 },
     { ISD::BITREVERSE, MVT::v16i32,  5 },
     { ISD::BITREVERSE, MVT::v32i16,  5 },
     { ISD::BITREVERSE, MVT::v64i8,   5 },
+    { ISD::CTLZ,       MVT::v8i64,  23 },
+    { ISD::CTLZ,       MVT::v16i32, 22 },
+    { ISD::CTLZ,       MVT::v32i16, 18 },
+    { ISD::CTLZ,       MVT::v64i8,  17 },
+    { ISD::CTPOP,      MVT::v8i64,   7 },
+    { ISD::CTPOP,      MVT::v16i32, 11 },
+    { ISD::CTPOP,      MVT::v32i16,  9 },
+    { ISD::CTPOP,      MVT::v64i8,   6 },
+    { ISD::CTTZ,       MVT::v8i64,  10 },
+    { ISD::CTTZ,       MVT::v16i32, 14 },
+    { ISD::CTTZ,       MVT::v32i16, 12 },
+    { ISD::CTTZ,       MVT::v64i8,   9 },
   };
   static const CostTblEntry AVX512CostTbl[] = {
     { ISD::BITREVERSE, MVT::v8i64,  36 },
     { ISD::BITREVERSE, MVT::v16i32, 24 },
+    { ISD::CTLZ,       MVT::v8i64,  29 },
+    { ISD::CTLZ,       MVT::v16i32, 35 },
+    { ISD::CTPOP,      MVT::v8i64,  16 },
+    { ISD::CTPOP,      MVT::v16i32, 24 },
+    { ISD::CTTZ,       MVT::v8i64,  20 },
+    { ISD::CTTZ,       MVT::v16i32, 28 },
   };
   static const CostTblEntry XOPCostTbl[] = {
     { ISD::BITREVERSE, MVT::v4i64,   4 },
@@ -1560,6 +1592,10 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
   MVT MTy = LT.second;
 
   // Attempt to lookup cost.
+  if (ST->hasCDI())
+    if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
+      return LT.first * Entry->Cost;
+
   if (ST->hasBWI())
     if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
       return LT.first * Entry->Cost;
diff --git a/lib/Target/XCore/XCoreTargetMachine.cpp b/lib/Target/XCore/XCoreTargetMachine.cpp
index e28e05c7f6a8..2950e2efbea3 100644
--- a/lib/Target/XCore/XCoreTargetMachine.cpp
+++ b/lib/Target/XCore/XCoreTargetMachine.cpp
@@ -74,7 +74,7 @@ TargetPassConfig *XCoreTargetMachine::createPassConfig(PassManagerBase &PM) {
 }
 
 void XCorePassConfig::addIRPasses() {
-  addPass(createAtomicExpandPass(&getXCoreTargetMachine()));
+  addPass(createAtomicExpandPass());
 
   TargetPassConfig::addIRPasses();
 }
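For reference, the AVX-512 cost-table additions above follow the usual pattern in this file: probe the most feature-specific table first, take the first matching entry, and scale its cost by the type-legalization split count. A simplified standalone sketch; the enums and the lookup helper are stand-ins, not LLVM's CostTblEntry/CostTableLookup:

#include <cstddef>
#include <cstdio>

enum ISDOp { CTLZ, CTPOP, CTTZ };
enum MVTTy { v16i32, v8i64 };

struct CostTblEntry { ISDOp ISD; MVTTy Type; int Cost; };

// Mirrors two of the AVX512CDCostTbl entries added above: with CDI,
// vector ctlz is a single instruction, hence cost 1.
static const CostTblEntry AVX512CDCostTbl[] = {
    {CTLZ, v8i64, 1},
    {CTLZ, v16i32, 1},
};

template <std::size_t N>
static const CostTblEntry *lookup(const CostTblEntry (&Tbl)[N], ISDOp Opc,
                                  MVTTy Ty) {
  for (const CostTblEntry &E : Tbl)
    if (E.ISD == Opc && E.Type == Ty)
      return &E; // first match wins
  return nullptr; // caller falls through to a more generic table
}

int main() {
  // Plays the role of LT.first in the real code: how many legal-type ops one
  // IR op splits into, e.g. a v32i32 ctlz legalized as two v16i32 ops.
  int SplitFactor = 2;
  if (const CostTblEntry *E = lookup(AVX512CDCostTbl, CTLZ, v16i32))
    std::printf("cost = %d\n", SplitFactor * E->Cost); // cost = 2
}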