author     Dimitry Andric <dim@FreeBSD.org>  2017-05-22 19:43:28 +0000
committer  Dimitry Andric <dim@FreeBSD.org>  2017-05-22 19:43:28 +0000
commit     b5630dbadf9a2a06754194387d6b0fd9962a67f1 (patch)
tree       3fe1e2bc0dc2823ab21f06959fbb3eaca317ea29 /lib/Target/AMDGPU
parent     7af96fb3afd6725a2824a0a5ca5dad34e5e0b056 (diff)
Diffstat (limited to 'lib/Target/AMDGPU')
-rw-r--r--  lib/Target/AMDGPU/AMDGPU.h                               20
-rw-r--r--  lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp       15
-rw-r--r--  lib/Target/AMDGPU/AMDGPUCallLowering.h                    3
-rw-r--r--  lib/Target/AMDGPU/AMDGPUCallingConv.td                   50
-rw-r--r--  lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp               24
-rw-r--r--  lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp                120
-rw-r--r--  lib/Target/AMDGPU/AMDGPUISelLowering.cpp                163
-rw-r--r--  lib/Target/AMDGPU/AMDGPUISelLowering.h                    5
-rw-r--r--  lib/Target/AMDGPU/AMDGPUInstrInfo.td                      2
-rw-r--r--  lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp              21
-rw-r--r--  lib/Target/AMDGPU/AMDGPUMCInstLower.cpp                   8
-rw-r--r--  lib/Target/AMDGPU/AMDGPUMachineFunction.cpp              17
-rw-r--r--  lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp                31
-rw-r--r--  lib/Target/AMDGPU/AMDGPURegisterInfo.cpp                 45
-rw-r--r--  lib/Target/AMDGPU/AMDGPURegisterInfo.h                    3
-rw-r--r--  lib/Target/AMDGPU/AMDGPUTargetMachine.cpp                20
-rw-r--r--  lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp          25
-rw-r--r--  lib/Target/AMDGPU/BUFInstructions.td                      4
-rw-r--r--  lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp    14
-rw-r--r--  lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h       2
-rw-r--r--  lib/Target/AMDGPU/GCNRegPressure.cpp                    146
-rw-r--r--  lib/Target/AMDGPU/GCNRegPressure.h                        2
-rw-r--r--  lib/Target/AMDGPU/R600ClauseMergePass.cpp                 6
-rw-r--r--  lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp            6
-rw-r--r--  lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp             6
-rw-r--r--  lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp         6
-rw-r--r--  lib/Target/AMDGPU/R600Packetizer.cpp                      6
-rw-r--r--  lib/Target/AMDGPU/R600RegisterInfo.cpp                   12
-rw-r--r--  lib/Target/AMDGPU/R600RegisterInfo.h                      2
-rw-r--r--  lib/Target/AMDGPU/SIFrameLowering.cpp                    20
-rw-r--r--  lib/Target/AMDGPU/SIFrameLowering.h                       2
-rw-r--r--  lib/Target/AMDGPU/SIISelLowering.cpp                    174
-rw-r--r--  lib/Target/AMDGPU/SIISelLowering.h                       11
-rw-r--r--  lib/Target/AMDGPU/SIInstrFormats.td                      16
-rw-r--r--  lib/Target/AMDGPU/SILoadStoreOptimizer.cpp                8
-rw-r--r--  lib/Target/AMDGPU/SIMachineFunctionInfo.cpp              19
-rw-r--r--  lib/Target/AMDGPU/SIMachineFunctionInfo.h                 5
-rw-r--r--  lib/Target/AMDGPU/SIPeepholeSDWA.cpp                     45
-rw-r--r--  lib/Target/AMDGPU/SIRegisterInfo.cpp                    120
-rw-r--r--  lib/Target/AMDGPU/SIRegisterInfo.h                       14
-rw-r--r--  lib/Target/AMDGPU/SOPInstructions.td                     18
-rw-r--r--  lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp               13
-rw-r--r--  lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h                  1
43 files changed, 922 insertions, 328 deletions
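
A recurring refactoring in this import removes the TargetMachine pointer that AMDGPU IR passes used to take in their constructors; the passes now query TargetPassConfig from the pass manager when they run, as the hunks below do for AMDGPUAnnotateKernelFeatures, AMDGPUCodeGenPrepare, AMDGPULowerIntrinsics and AMDGPUPromoteAlloca. A minimal sketch of the new pattern, with a hypothetical pass name and only the API calls that appear in these hunks:

#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Function.h"
#include "llvm/Pass.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;

namespace {
// Hypothetical pass, for illustration only.
struct ExamplePass : FunctionPass {
  static char ID;
  ExamplePass() : FunctionPass(ID) {} // no TargetMachine argument any more

  bool runOnFunction(Function &F) override {
    // TargetPassConfig is only registered when the pass runs inside a
    // codegen pipeline (e.g. llc); bailing out when it is absent replaces
    // the old "was a TM passed to the constructor?" checks.
    auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
    if (!TPC)
      return false;
    const TargetMachine &TM = TPC->getTM<TargetMachine>();
    // Subtarget lookups then proceed exactly as before, e.g.:
    //   const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>(F);
    (void)F; (void)TM; // silence unused warnings in this stub
    return false;
  }
};
} // end anonymous namespace

char ExamplePass::ID = 0;

Correspondingly, the INITIALIZE_TM_PASS macros in these files become plain INITIALIZE_PASS, since pass initialization no longer carries target-machine state.
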
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h
index 3f89702bed505..78ff3bbe3d1a0 100644
--- a/lib/Target/AMDGPU/AMDGPU.h
+++ b/lib/Target/AMDGPU/AMDGPU.h
@@ -27,12 +27,12 @@ class PassRegistry;
class Module;
// R600 Passes
-FunctionPass *createR600VectorRegMerger(TargetMachine &tm);
-FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm);
+FunctionPass *createR600VectorRegMerger();
+FunctionPass *createR600ExpandSpecialInstrsPass();
FunctionPass *createR600EmitClauseMarkers();
-FunctionPass *createR600ClauseMergePass(TargetMachine &tm);
-FunctionPass *createR600Packetizer(TargetMachine &tm);
-FunctionPass *createR600ControlFlowFinalizer(TargetMachine &tm);
+FunctionPass *createR600ClauseMergePass();
+FunctionPass *createR600Packetizer();
+FunctionPass *createR600ControlFlowFinalizer();
FunctionPass *createAMDGPUCFGStructurizerPass();
// SI Passes
@@ -42,24 +42,24 @@ FunctionPass *createSIFoldOperandsPass();
FunctionPass *createSIPeepholeSDWAPass();
FunctionPass *createSILowerI1CopiesPass();
FunctionPass *createSIShrinkInstructionsPass();
-FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm);
+FunctionPass *createSILoadStoreOptimizerPass();
FunctionPass *createSIWholeQuadModePass();
FunctionPass *createSIFixControlFlowLiveIntervalsPass();
FunctionPass *createSIFixSGPRCopiesPass();
FunctionPass *createSIDebuggerInsertNopsPass();
FunctionPass *createSIInsertWaitsPass();
FunctionPass *createSIInsertWaitcntsPass();
-FunctionPass *createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM = nullptr);
+FunctionPass *createAMDGPUCodeGenPreparePass();
FunctionPass *createAMDGPUMachineCFGStructurizerPass();
void initializeAMDGPUMachineCFGStructurizerPass(PassRegistry&);
extern char &AMDGPUMachineCFGStructurizerID;
-ModulePass *createAMDGPUAnnotateKernelFeaturesPass(const TargetMachine *TM = nullptr);
+ModulePass *createAMDGPUAnnotateKernelFeaturesPass();
void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
extern char &AMDGPUAnnotateKernelFeaturesID;
-ModulePass *createAMDGPULowerIntrinsicsPass(const TargetMachine *TM = nullptr);
+ModulePass *createAMDGPULowerIntrinsicsPass();
void initializeAMDGPULowerIntrinsicsPass(PassRegistry &);
extern char &AMDGPULowerIntrinsicsID;
@@ -97,7 +97,7 @@ void initializeSIOptimizeExecMaskingPass(PassRegistry &);
extern char &SIOptimizeExecMaskingID;
// Passes common to R600 and SI
-FunctionPass *createAMDGPUPromoteAlloca(const TargetMachine *TM = nullptr);
+FunctionPass *createAMDGPUPromoteAlloca();
void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
extern char &AMDGPUPromoteAllocaID;
diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
index 3d8db7cd8af55..7235d8fae3327 100644
--- a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -15,6 +15,7 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
@@ -27,7 +28,6 @@ namespace {
class AMDGPUAnnotateKernelFeatures : public ModulePass {
private:
- const TargetMachine *TM;
AMDGPUAS AS;
static bool hasAddrSpaceCast(const Function &F, AMDGPUAS AS);
@@ -37,8 +37,7 @@ private:
public:
static char ID;
- AMDGPUAnnotateKernelFeatures(const TargetMachine *TM_ = nullptr) :
- ModulePass(ID), TM(TM_) {}
+ AMDGPUAnnotateKernelFeatures() : ModulePass(ID) {}
bool runOnModule(Module &M) override;
StringRef getPassName() const override {
return "AMDGPU Annotate Kernel Features";
@@ -221,8 +220,10 @@ bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) {
if (F.hasFnAttribute("amdgpu-queue-ptr"))
continue;
- bool HasApertureRegs =
- TM && TM->getSubtarget<AMDGPUSubtarget>(F).hasApertureRegs();
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ bool HasApertureRegs = TPC && TPC->getTM<TargetMachine>()
+ .getSubtarget<AMDGPUSubtarget>(F)
+ .hasApertureRegs();
if (!HasApertureRegs && hasAddrSpaceCast(F, AS))
F.addFnAttr("amdgpu-queue-ptr");
}
@@ -231,6 +232,6 @@ bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) {
return Changed;
}
-ModulePass *llvm::createAMDGPUAnnotateKernelFeaturesPass(const TargetMachine *TM) {
- return new AMDGPUAnnotateKernelFeatures(TM);
+ModulePass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
+ return new AMDGPUAnnotateKernelFeatures();
}
diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.h b/lib/Target/AMDGPU/AMDGPUCallLowering.h
index 09bdf8ffcde7b..251cb7a2c440d 100644
--- a/lib/Target/AMDGPU/AMDGPUCallLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -38,7 +38,8 @@ class AMDGPUCallLowering: public CallLowering {
unsigned VReg) const override;
bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
ArrayRef<unsigned> VRegs) const override;
- CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const;
+ static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);
+ static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg);
};
} // End of namespace llvm;
#endif
diff --git a/lib/Target/AMDGPU/AMDGPUCallingConv.td b/lib/Target/AMDGPU/AMDGPUCallingConv.td
index d308f718aae13..4bef7a89bfe34 100644
--- a/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -13,6 +13,8 @@
// Inversion of CCIfInReg
class CCIfNotInReg<CCAction A> : CCIf<"!ArgFlags.isInReg()", A> {}
+class CCIfExtend<CCAction A>
+ : CCIf<"ArgFlags.isSExt() || ArgFlags.isZExt()", A>;
// Calling convention for SI
def CC_SI : CallingConv<[
@@ -52,7 +54,7 @@ def CC_SI : CallingConv<[
]>>>
]>;
-def RetCC_SI : CallingConv<[
+def RetCC_SI_Shader : CallingConv<[
CCIfType<[i32] , CCAssignToReg<[
SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
@@ -99,6 +101,52 @@ def CC_AMDGPU_Kernel : CallingConv<[
CCCustom<"allocateKernArg">
]>;
+def CSR_AMDGPU_VGPRs_24_255 : CalleeSavedRegs<
+ (sequence "VGPR%u", 24, 255)
+>;
+
+def CSR_AMDGPU_VGPRs_32_255 : CalleeSavedRegs<
+ (sequence "VGPR%u", 32, 255)
+>;
+
+def CSR_AMDGPU_SGPRs_32_103 : CalleeSavedRegs<
+ (sequence "SGPR%u", 32, 103)
+>;
+
+def CSR_AMDGPU_HighRegs : CalleeSavedRegs<
+ (add CSR_AMDGPU_VGPRs_32_255, CSR_AMDGPU_SGPRs_32_103)
+>;
+
+// Calling convention for leaf functions
+def CC_AMDGPU_Func : CallingConv<[
+ CCIfByVal<CCPassByVal<4, 4>>,
+ CCIfType<[i1], CCPromoteToType<i32>>,
+ CCIfType<[i1, i8, i16], CCIfExtend<CCPromoteToType<i32>>>,
+ CCIfType<[i32, f32, i16, f16, v2i16, v2f16, i1], CCAssignToReg<[
+ VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
+ VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
+ VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
+ VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>,
+ CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64], CCCustom<"allocateVGPRTuple">>,
+ CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>,
+ CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>,
+ CCIfType<[v4i32, v4f32, v2i64, v2f64], CCAssignToStack<16, 4>>,
+ CCIfType<[v8i32, v8f32], CCAssignToStack<32, 4>>,
+ CCIfType<[v16i32, v16f32], CCAssignToStack<64, 4>>
+]>;
+
+// Calling convention for leaf functions
+def RetCC_AMDGPU_Func : CallingConv<[
+ CCIfType<[i1], CCPromoteToType<i32>>,
+ CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>,
+ CCIfType<[i32, f32, i16, f16, v2i16, v2f16], CCAssignToReg<[
+ VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
+ VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
+ VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
+ VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>,
+ CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64], CCCustom<"allocateVGPRTuple">>
+]>;
+
def CC_AMDGPU : CallingConv<[
CCIf<"static_cast<const AMDGPUSubtarget&>"
"(State.getMachineFunction().getSubtarget()).getGeneration() >="
diff --git a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index e19314fe0a6c8..d923cb117c120 100644
--- a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -19,6 +19,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
@@ -48,7 +49,6 @@ namespace {
class AMDGPUCodeGenPrepare : public FunctionPass,
public InstVisitor<AMDGPUCodeGenPrepare, bool> {
- const GCNTargetMachine *TM;
const SISubtarget *ST = nullptr;
DivergenceAnalysis *DA = nullptr;
Module *Mod = nullptr;
@@ -127,8 +127,7 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
public:
static char ID;
- AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) :
- FunctionPass(ID), TM(static_cast<const GCNTargetMachine *>(TM)) {}
+ AMDGPUCodeGenPrepare() : FunctionPass(ID) {}
bool visitFDiv(BinaryOperator &I);
@@ -487,10 +486,15 @@ bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
}
bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
- if (!TM || skipFunction(F))
+ if (skipFunction(F))
return false;
- ST = &TM->getSubtarget<SISubtarget>(F);
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ if (!TPC)
+ return false;
+
+ const TargetMachine &TM = TPC->getTM<TargetMachine>();
+ ST = &TM.getSubtarget<SISubtarget>(F);
DA = &getAnalysis<DivergenceAnalysis>();
HasUnsafeFPMath = hasUnsafeFPMath(F);
@@ -507,14 +511,14 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
return MadeChange;
}
-INITIALIZE_TM_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
+INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
"AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
-INITIALIZE_TM_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE,
- "AMDGPU IR optimizations", false, false)
+INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
+ false, false)
char AMDGPUCodeGenPrepare::ID = 0;
-FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM) {
- return new AMDGPUCodeGenPrepare(TM);
+FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
+ return new AMDGPUCodeGenPrepare();
}
diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index c3ac796a0a442..19fce064783d0 100644
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -82,7 +82,7 @@ public:
void PostprocessISelDAG() override;
private:
- SDValue foldFrameIndex(SDValue N) const;
+ std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const;
bool isNoNanSrc(SDValue N) const;
bool isInlineImmediate(const SDNode *N) const;
bool FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg, SDValue &Abs,
@@ -116,9 +116,11 @@ private:
bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
SDValue &VAddr, SDValue &SOffset, SDValue &Offset,
SDValue &SLC) const;
- bool SelectMUBUFScratchOffen(SDValue Addr, SDValue &RSrc, SDValue &VAddr,
+ bool SelectMUBUFScratchOffen(SDNode *Root,
+ SDValue Addr, SDValue &RSrc, SDValue &VAddr,
SDValue &SOffset, SDValue &ImmOffset) const;
- bool SelectMUBUFScratchOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
+ bool SelectMUBUFScratchOffset(SDNode *Root,
+ SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
SDValue &Offset) const;
bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset,
@@ -1074,13 +1076,33 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE);
}
-SDValue AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
- if (auto FI = dyn_cast<FrameIndexSDNode>(N))
- return CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
- return N;
+static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
+ auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
+ return PSV && PSV->isStack();
+}
+
+std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
+ const MachineFunction &MF = CurDAG->getMachineFunction();
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+
+ if (auto FI = dyn_cast<FrameIndexSDNode>(N)) {
+ SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
+ FI->getValueType(0));
+
+ // If we can resolve this to a frame index access, this is relative to the
+ // frame pointer SGPR.
+ return std::make_pair(TFI, CurDAG->getRegister(Info->getFrameOffsetReg(),
+ MVT::i32));
+ }
+
+ // If we don't know this private access is a local stack object, it needs to
+ // be relative to the entry point's scratch wave offset register.
+ return std::make_pair(N, CurDAG->getRegister(Info->getScratchWaveOffsetReg(),
+ MVT::i32));
}
-bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDValue Addr, SDValue &Rsrc,
+bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Root,
+ SDValue Addr, SDValue &Rsrc,
SDValue &VAddr, SDValue &SOffset,
SDValue &ImmOffset) const {
@@ -1089,7 +1111,6 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDValue Addr, SDValue &Rsrc,
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
- SOffset = CurDAG->getRegister(Info->getScratchWaveOffsetReg(), MVT::i32);
if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
unsigned Imm = CAddr->getZExtValue();
@@ -1100,6 +1121,14 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDValue Addr, SDValue &Rsrc,
MachineSDNode *MovHighBits = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
DL, MVT::i32, HighBits);
VAddr = SDValue(MovHighBits, 0);
+
+ // In a call sequence, stores to the argument stack area are relative to the
+ // stack pointer.
+ const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Root)->getPointerInfo();
+ unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ?
+ Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg();
+
+ SOffset = CurDAG->getRegister(SOffsetReg, MVT::i32);
ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16);
return true;
}
@@ -1113,19 +1142,20 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDValue Addr, SDValue &Rsrc,
// Offsets in vaddr must be positive.
ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
if (isLegalMUBUFImmOffset(C1)) {
- VAddr = foldFrameIndex(N0);
+ std::tie(VAddr, SOffset) = foldFrameIndex(N0);
ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
return true;
}
}
// (node)
- VAddr = foldFrameIndex(Addr);
+ std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16);
return true;
}
-bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDValue Addr,
+bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Root,
+ SDValue Addr,
SDValue &SRsrc,
SDValue &SOffset,
SDValue &Offset) const {
@@ -1138,7 +1168,15 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDValue Addr,
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
- SOffset = CurDAG->getRegister(Info->getScratchWaveOffsetReg(), MVT::i32);
+
+ const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Root)->getPointerInfo();
+ unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ?
+ Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg();
+
+ // FIXME: Get from MachinePointerInfo? We should only be using the frame
+ // offset if we know this is in a call sequence.
+ SOffset = CurDAG->getRegister(SOffsetReg, MVT::i32);
+
Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
return true;
}
@@ -1700,12 +1738,46 @@ bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
return true;
}
+static SDValue stripBitcast(SDValue Val) {
+ return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
+}
+
+// Figure out if this is really an extract of the high 16-bits of a dword.
+static bool isExtractHiElt(SDValue In, SDValue &Out) {
+ In = stripBitcast(In);
+ if (In.getOpcode() != ISD::TRUNCATE)
+ return false;
+
+ SDValue Srl = In.getOperand(0);
+ if (Srl.getOpcode() == ISD::SRL) {
+ if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
+ if (ShiftAmt->getZExtValue() == 16) {
+ Out = stripBitcast(Srl.getOperand(0));
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+// Look through operations that obscure just looking at the low 16-bits of the
+// same register.
+static SDValue stripExtractLoElt(SDValue In) {
+ if (In.getOpcode() == ISD::TRUNCATE) {
+ SDValue Src = In.getOperand(0);
+ if (Src.getValueType().getSizeInBits() == 32)
+ return stripBitcast(Src);
+ }
+
+ return In;
+}
+
bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
unsigned Mods = 0;
Src = In;
- // FIXME: Look for on separate components
if (Src.getOpcode() == ISD::FNEG) {
Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
Src = Src.getOperand(0);
@@ -1714,19 +1786,28 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
if (Src.getOpcode() == ISD::BUILD_VECTOR) {
unsigned VecMods = Mods;
- SDValue Lo = Src.getOperand(0);
- SDValue Hi = Src.getOperand(1);
+ SDValue Lo = stripBitcast(Src.getOperand(0));
+ SDValue Hi = stripBitcast(Src.getOperand(1));
if (Lo.getOpcode() == ISD::FNEG) {
- Lo = Lo.getOperand(0);
+ Lo = stripBitcast(Lo.getOperand(0));
Mods ^= SISrcMods::NEG;
}
if (Hi.getOpcode() == ISD::FNEG) {
- Hi = Hi.getOperand(0);
+ Hi = stripBitcast(Hi.getOperand(0));
Mods ^= SISrcMods::NEG_HI;
}
+ if (isExtractHiElt(Lo, Lo))
+ Mods |= SISrcMods::OP_SEL_0;
+
+ if (isExtractHiElt(Hi, Hi))
+ Mods |= SISrcMods::OP_SEL_1;
+
+ Lo = stripExtractLoElt(Lo);
+ Hi = stripExtractLoElt(Hi);
+
if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
// Really a scalar input. Just select from the low half of the register to
// avoid packing.
@@ -1740,9 +1821,6 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
}
// Packed instructions do not have abs modifiers.
-
- // FIXME: Handle abs/neg of individual components.
- // FIXME: Handle swizzling with op_sel
Mods |= SISrcMods::OP_SEL_1;
SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index f80652b873730..5ec46a8294c0c 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -76,6 +76,45 @@ static bool allocateSGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
}
}
+// Allocate up to VGPR31.
+//
+// TODO: Since there are no VGPR alignment requirements would it be better to
+// split into individual scalar registers?
+static bool allocateVGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, CCState &State) {
+ switch (LocVT.SimpleTy) {
+ case MVT::i64:
+ case MVT::f64:
+ case MVT::v2i32:
+ case MVT::v2f32: {
+ return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
+ &AMDGPU::VReg_64RegClass, 31);
+ }
+ case MVT::v4i32:
+ case MVT::v4f32:
+ case MVT::v2i64:
+ case MVT::v2f64: {
+ return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
+ &AMDGPU::VReg_128RegClass, 29);
+ }
+ case MVT::v8i32:
+ case MVT::v8f32: {
+ return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
+ &AMDGPU::VReg_256RegClass, 25);
+
+ }
+ case MVT::v16i32:
+ case MVT::v16f32: {
+ return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
+ &AMDGPU::VReg_512RegClass, 17);
+
+ }
+ default:
+ return false;
+ }
+}
+
#include "AMDGPUGenCallingConv.inc"
// Find a larger type to do a load / store of a vector with.
@@ -773,8 +812,43 @@ bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
//===---------------------------------------------------------------------===//
CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
- bool IsVarArg) const {
- return CC_AMDGPU;
+ bool IsVarArg) {
+ switch (CC) {
+ case CallingConv::AMDGPU_KERNEL:
+ case CallingConv::SPIR_KERNEL:
+ return CC_AMDGPU_Kernel;
+ case CallingConv::AMDGPU_VS:
+ case CallingConv::AMDGPU_GS:
+ case CallingConv::AMDGPU_PS:
+ case CallingConv::AMDGPU_CS:
+ case CallingConv::AMDGPU_HS:
+ return CC_AMDGPU;
+ case CallingConv::C:
+ case CallingConv::Fast:
+ return CC_AMDGPU_Func;
+ default:
+ report_fatal_error("Unsupported calling convention.");
+ }
+}
+
+CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
+ bool IsVarArg) {
+ switch (CC) {
+ case CallingConv::AMDGPU_KERNEL:
+ case CallingConv::SPIR_KERNEL:
+ return CC_AMDGPU_Kernel;
+ case CallingConv::AMDGPU_VS:
+ case CallingConv::AMDGPU_GS:
+ case CallingConv::AMDGPU_PS:
+ case CallingConv::AMDGPU_CS:
+ case CallingConv::AMDGPU_HS:
+ return RetCC_SI_Shader;
+ case CallingConv::C:
+ case CallingConv::Fast:
+ return RetCC_AMDGPU_Func;
+ default:
+ report_fatal_error("Unsupported calling convention.");
+ }
}
/// The SelectionDAGBuilder will automatically promote function arguments
@@ -874,18 +948,15 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(CCState &State,
}
}
-void AMDGPUTargetLowering::AnalyzeReturn(CCState &State,
- const SmallVectorImpl<ISD::OutputArg> &Outs) const {
-
- State.AnalyzeReturn(Outs, RetCC_SI);
-}
-
-SDValue
-AMDGPUTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
- bool isVarArg,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SDLoc &DL, SelectionDAG &DAG) const {
+SDValue AMDGPUTargetLowering::LowerReturn(
+ SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SDLoc &DL, SelectionDAG &DAG) const {
+ // FIXME: Fails for r600 tests
+ //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
+ // "wave terminate should not have return values");
return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
}
@@ -896,20 +967,12 @@ AMDGPUTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
/// Selects the correct CCAssignFn for a given CallingConvention value.
CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
bool IsVarArg) {
- switch (CC) {
- case CallingConv::C:
- case CallingConv::AMDGPU_KERNEL:
- case CallingConv::SPIR_KERNEL:
- return CC_AMDGPU_Kernel;
- case CallingConv::AMDGPU_VS:
- case CallingConv::AMDGPU_HS:
- case CallingConv::AMDGPU_GS:
- case CallingConv::AMDGPU_PS:
- case CallingConv::AMDGPU_CS:
- return CC_AMDGPU;
- default:
- report_fatal_error("Unsupported calling convention.");
- }
+ return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
+}
+
+CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
+ bool IsVarArg) {
+ return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
}
SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
@@ -2532,27 +2595,49 @@ SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
- if (N->getValueType(0) != MVT::i64)
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::i64)
return SDValue();
- // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
-
- // On some subtargets, 64-bit shift is a quarter rate instruction. In the
- // common case, splitting this into a move and a 32-bit shift is faster and
- // the same code size.
- const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (!RHS)
return SDValue();
- unsigned RHSVal = RHS->getZExtValue();
- if (RHSVal < 32)
- return SDValue();
-
SDValue LHS = N->getOperand(0);
+ unsigned RHSVal = RHS->getZExtValue();
+ if (!RHSVal)
+ return LHS;
SDLoc SL(N);
SelectionDAG &DAG = DCI.DAG;
+ switch (LHS->getOpcode()) {
+ default:
+ break;
+ case ISD::ZERO_EXTEND:
+ case ISD::SIGN_EXTEND:
+ case ISD::ANY_EXTEND: {
+ // shl (ext x) => zext (shl x), if shift does not overflow int
+ KnownBits Known;
+ SDValue X = LHS->getOperand(0);
+ DAG.computeKnownBits(X, Known);
+ unsigned LZ = Known.countMinLeadingZeros();
+ if (LZ < RHSVal)
+ break;
+ EVT XVT = X.getValueType();
+ SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
+ return DAG.getZExtOrTrunc(Shl, SL, VT);
+ }
+ }
+
+ // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
+
+ // On some subtargets, 64-bit shift is a quarter rate instruction. In the
+ // common case, splitting this into a move and a 32-bit shift is faster and
+ // the same code size.
+ if (RHSVal < 32)
+ return SDValue();
+
SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
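
The performShlCombine() hunk above adds two folds: a zero-amount shift now returns its operand directly, and shl (ext x), C becomes zext (shl x, C) whenever computeKnownBits() proves x has at least C leading zero bits, so the narrow shift cannot lose bits. A standalone arithmetic check of that identity in plain C++ (hypothetical values, independent of LLVM):

#include <cassert>
#include <cstdint>

int main() {
  uint32_t X = 0x00ABCDEF; // at least 8 known leading zero bits
  unsigned C = 8;          // shift amount not exceeding the leading zeros
  uint64_t WideShift = uint64_t(X) << C;        // shl (zext x), C
  uint64_t NarrowThenExtend = uint64_t(X << C); // zext (shl x, C)
  assert(WideShift == NarrowThenExtend);
  return 0;
}
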
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 4c588a7bafd05..fb2f15022d259 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -115,9 +115,6 @@ protected:
SmallVectorImpl<SDValue> &Results) const;
void analyzeFormalArgumentsCompute(CCState &State,
const SmallVectorImpl<ISD::InputArg> &Ins) const;
- void AnalyzeReturn(CCState &State,
- const SmallVectorImpl<ISD::OutputArg> &Outs) const;
-
public:
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI);
@@ -164,6 +161,8 @@ public:
bool isCheapToSpeculateCtlz() const override;
static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);
+ static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg);
+
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index 353cc57427915..e286558ce60d7 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -380,6 +380,6 @@ def AMDGPUendpgm : SDNode<"AMDGPUISD::ENDPGM", SDTNone,
def AMDGPUreturn_to_epilog : SDNode<"AMDGPUISD::RETURN_TO_EPILOG", SDTNone,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
-def AMDGPUret_flag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone,
+def AMDGPUret_flag : SDNode<"AMDGPUISD::RET_FLAG", SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
>;
diff --git a/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
index dcb6670621eef..846e7dff5f8cc 100644
--- a/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
+++ b/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
@@ -9,6 +9,7 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -25,15 +26,13 @@ const unsigned MaxStaticSize = 1024;
class AMDGPULowerIntrinsics : public ModulePass {
private:
- const TargetMachine *TM;
-
bool makeLIDRangeMetadata(Function &F) const;
public:
static char ID;
- AMDGPULowerIntrinsics(const TargetMachine *TM = nullptr)
- : ModulePass(ID), TM(TM) { }
+ AMDGPULowerIntrinsics() : ModulePass(ID) {}
+
bool runOnModule(Module &M) override;
StringRef getPassName() const override {
return "AMDGPU Lower Intrinsics";
@@ -46,8 +45,8 @@ char AMDGPULowerIntrinsics::ID = 0;
char &llvm::AMDGPULowerIntrinsicsID = AMDGPULowerIntrinsics::ID;
-INITIALIZE_TM_PASS(AMDGPULowerIntrinsics, DEBUG_TYPE,
- "Lower intrinsics", false, false)
+INITIALIZE_PASS(AMDGPULowerIntrinsics, DEBUG_TYPE, "Lower intrinsics", false,
+ false)
// TODO: Should refine based on estimated number of accesses (e.g. does it
// require splitting based on alignment)
@@ -104,11 +103,13 @@ static bool expandMemIntrinsicUses(Function &F) {
}
bool AMDGPULowerIntrinsics::makeLIDRangeMetadata(Function &F) const {
- if (!TM)
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ if (!TPC)
return false;
+ const TargetMachine &TM = TPC->getTM<TargetMachine>();
+ const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>(F);
bool Changed = false;
- const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
for (auto *U : F.users()) {
auto *CI = dyn_cast<CallInst>(U);
@@ -155,6 +156,6 @@ bool AMDGPULowerIntrinsics::runOnModule(Module &M) {
return Changed;
}
-ModulePass *llvm::createAMDGPULowerIntrinsicsPass(const TargetMachine *TM) {
- return new AMDGPULowerIntrinsics(TM);
+ModulePass *llvm::createAMDGPULowerIntrinsicsPass() {
+ return new AMDGPULowerIntrinsics();
}
diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index da247fea7de6e..f1ef6281c90fb 100644
--- a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -126,9 +126,15 @@ bool AMDGPUMCInstLower::lowerOperand(const MachineOperand &MO,
}
void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
+ unsigned Opcode = MI->getOpcode();
- int MCOpcode = ST.getInstrInfo()->pseudoToMCOpcode(MI->getOpcode());
+ // FIXME: Should be able to handle this with emitPseudoExpansionLowering. We
+ // need to select it to the subtarget specific version, and there's no way to
+ // do that with a single pseudo source operation.
+ if (Opcode == AMDGPU::S_SETPC_B64_return)
+ Opcode = AMDGPU::S_SETPC_B64;
+ int MCOpcode = ST.getInstrInfo()->pseudoToMCOpcode(Opcode);
if (MCOpcode == -1) {
LLVMContext &C = MI->getParent()->getParent()->getFunction()->getContext();
C.emitError("AMDGPUMCInstLower::lower - Pseudo instruction doesn't have "
diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index fe7283ccf7d91..9fb7f5f889271 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -12,21 +12,6 @@
using namespace llvm;
-static bool isEntryFunctionCC(CallingConv::ID CC) {
- switch (CC) {
- case CallingConv::AMDGPU_KERNEL:
- case CallingConv::SPIR_KERNEL:
- case CallingConv::AMDGPU_VS:
- case CallingConv::AMDGPU_HS:
- case CallingConv::AMDGPU_GS:
- case CallingConv::AMDGPU_PS:
- case CallingConv::AMDGPU_CS:
- return true;
- default:
- return false;
- }
-}
-
AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
MachineFunctionInfo(),
LocalMemoryObjects(),
@@ -34,7 +19,7 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
MaxKernArgAlign(0),
LDSSize(0),
ABIArgOffset(0),
- IsEntryFunction(isEntryFunctionCC(MF.getFunction()->getCallingConv())),
+ IsEntryFunction(AMDGPU::isEntryFunctionCC(MF.getFunction()->getCallingConv())),
NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath) {
// FIXME: Should initialize KernArgSize based on ExplicitKernelArgOffset,
// except reserved size is not correctly aligned.
diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index e40f395577471..85184b363905e 100644
--- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -23,6 +23,7 @@
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
@@ -99,8 +100,7 @@ private:
public:
static char ID;
- AMDGPUPromoteAlloca(const TargetMachine *TM_ = nullptr) :
- FunctionPass(ID), TM(TM_) {}
+ AMDGPUPromoteAlloca() : FunctionPass(ID) {}
bool doInitialization(Module &M) override;
bool runOnFunction(Function &F) override;
@@ -119,30 +119,31 @@ public:
char AMDGPUPromoteAlloca::ID = 0;
-INITIALIZE_TM_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
- "AMDGPU promote alloca to vector or LDS", false, false)
+INITIALIZE_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
+ "AMDGPU promote alloca to vector or LDS", false, false)
char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
- if (!TM)
- return false;
-
Mod = &M;
DL = &Mod->getDataLayout();
- const Triple &TT = TM->getTargetTriple();
-
- IsAMDGCN = TT.getArch() == Triple::amdgcn;
- IsAMDHSA = TT.getOS() == Triple::AMDHSA;
-
return false;
}
bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
- if (!TM || skipFunction(F))
+ if (skipFunction(F))
return false;
+ if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
+ TM = &TPC->getTM<TargetMachine>();
+ else
+ return false;
+
+ const Triple &TT = TM->getTargetTriple();
+ IsAMDGCN = TT.getArch() == Triple::amdgcn;
+ IsAMDHSA = TT.getOS() == Triple::AMDHSA;
+
const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
if (!ST.isPromoteAllocaEnabled())
return false;
@@ -874,6 +875,6 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
}
}
-FunctionPass *llvm::createAMDGPUPromoteAlloca(const TargetMachine *TM) {
- return new AMDGPUPromoteAlloca(TM);
+FunctionPass *llvm::createAMDGPUPromoteAlloca() {
+ return new AMDGPUPromoteAlloca();
}
diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
index 941f2d8a468a8..b2867fcc49f97 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
@@ -14,6 +14,7 @@
#include "AMDGPURegisterInfo.h"
#include "AMDGPUTargetMachine.h"
+#include "SIRegisterInfo.h"
using namespace llvm;
@@ -24,18 +25,6 @@ AMDGPURegisterInfo::AMDGPURegisterInfo() : AMDGPUGenRegisterInfo(0) {}
// they are not supported at this time.
//===----------------------------------------------------------------------===//
-// Dummy to not crash RegisterClassInfo.
-static const MCPhysReg CalleeSavedReg = AMDGPU::NoRegister;
-
-const MCPhysReg *AMDGPURegisterInfo::getCalleeSavedRegs(
- const MachineFunction *) const {
- return &CalleeSavedReg;
-}
-
-unsigned AMDGPURegisterInfo::getFrameRegister(const MachineFunction &MF) const {
- return AMDGPU::NoRegister;
-}
-
unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const {
static const unsigned SubRegs[] = {
AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4,
@@ -50,3 +39,35 @@ unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const {
#define GET_REGINFO_TARGET_DESC
#include "AMDGPUGenRegisterInfo.inc"
+
+
+// Forced to be here by one .inc
+const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
+ const MachineFunction *MF) const {
+ CallingConv::ID CC = MF->getFunction()->getCallingConv();
+ switch (CC) {
+ case CallingConv::C:
+ case CallingConv::Fast:
+ return CSR_AMDGPU_HighRegs_SaveList;
+ default: {
+ // Dummy to not crash RegisterClassInfo.
+ static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
+ return &NoCalleeSavedReg;
+ }
+ }
+}
+
+const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
+ CallingConv::ID CC) const {
+ switch (CC) {
+ case CallingConv::C:
+ case CallingConv::Fast:
+ return CSR_AMDGPU_HighRegs_RegMask;
+ default:
+ return nullptr;
+ }
+}
+
+unsigned SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+ return AMDGPU::NoRegister;
+}
diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.h b/lib/Target/AMDGPU/AMDGPURegisterInfo.h
index 22b1663821d96..d8604d2590f1f 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterInfo.h
+++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.h
@@ -30,9 +30,6 @@ struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo {
/// \returns the sub reg enum value for the given \p Channel
/// (e.g. getSubRegFromChannel(0) -> AMDGPU::sub0)
unsigned getSubRegFromChannel(unsigned Channel) const;
-
- const MCPhysReg* getCalleeSavedRegs(const MachineFunction *MF) const override;
- unsigned getFrameRegister(const MachineFunction &MF) const override;
};
} // End namespace llvm
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 386a88b0520fb..a9d3a31a72407 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -570,7 +570,7 @@ void AMDGPUPassConfig::addIRPasses() {
disablePass(&FuncletLayoutID);
disablePass(&PatchableFunctionID);
- addPass(createAMDGPULowerIntrinsicsPass(&TM));
+ addPass(createAMDGPULowerIntrinsicsPass());
// Function calls are not supported, so make sure we inline everything.
addPass(createAMDGPUAlwaysInlinePass());
@@ -585,8 +585,7 @@ void AMDGPUPassConfig::addIRPasses() {
if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
// TODO: May want to move later or split into an early and late one.
- addPass(createAMDGPUCodeGenPreparePass(
- static_cast<const GCNTargetMachine *>(&TM)));
+ addPass(createAMDGPUCodeGenPreparePass());
}
// Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
@@ -594,7 +593,7 @@ void AMDGPUPassConfig::addIRPasses() {
if (TM.getOptLevel() > CodeGenOpt::None) {
addPass(createInferAddressSpacesPass());
- addPass(createAMDGPUPromoteAlloca(&TM));
+ addPass(createAMDGPUPromoteAlloca());
if (EnableSROA)
addPass(createSROAPass());
@@ -664,22 +663,22 @@ bool R600PassConfig::addPreISel() {
}
void R600PassConfig::addPreRegAlloc() {
- addPass(createR600VectorRegMerger(*TM));
+ addPass(createR600VectorRegMerger());
}
void R600PassConfig::addPreSched2() {
addPass(createR600EmitClauseMarkers(), false);
if (EnableR600IfConvert)
addPass(&IfConverterID, false);
- addPass(createR600ClauseMergePass(*TM), false);
+ addPass(createR600ClauseMergePass(), false);
}
void R600PassConfig::addPreEmitPass() {
addPass(createAMDGPUCFGStructurizerPass(), false);
- addPass(createR600ExpandSpecialInstrsPass(*TM), false);
+ addPass(createR600ExpandSpecialInstrsPass(), false);
addPass(&FinalizeMachineBundlesID, false);
- addPass(createR600Packetizer(*TM), false);
- addPass(createR600ControlFlowFinalizer(*TM), false);
+ addPass(createR600Packetizer(), false);
+ addPass(createR600ControlFlowFinalizer(), false);
}
TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
@@ -703,8 +702,7 @@ bool GCNPassConfig::addPreISel() {
// FIXME: We need to run a pass to propagate the attributes when calls are
// supported.
- const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
- addPass(createAMDGPUAnnotateKernelFeaturesPass(&TM));
+ addPass(createAMDGPUAnnotateKernelFeaturesPass());
// Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
// regions formed by them.
diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 70c848f3c7bd7..b52ea2b3a2c61 100644
--- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -2796,6 +2796,7 @@ void AMDGPUAsmParser::cvtDSImpl(MCInst &Inst, const OperandVector &Operands,
void AMDGPUAsmParser::cvtExp(MCInst &Inst, const OperandVector &Operands) {
OptionalImmIndexMap OptionalIdx;
+ unsigned OperandIdx[4];
unsigned EnMask = 0;
int SrcIdx = 0;
@@ -2804,15 +2805,18 @@ void AMDGPUAsmParser::cvtExp(MCInst &Inst, const OperandVector &Operands) {
// Add the register arguments
if (Op.isReg()) {
- EnMask |= (1 << SrcIdx);
+ assert(SrcIdx < 4);
+ OperandIdx[SrcIdx] = Inst.size();
Op.addRegOperands(Inst, 1);
++SrcIdx;
continue;
}
if (Op.isOff()) {
- ++SrcIdx;
+ assert(SrcIdx < 4);
+ OperandIdx[SrcIdx] = Inst.size();
Inst.addOperand(MCOperand::createReg(AMDGPU::NoRegister));
+ ++SrcIdx;
continue;
}
@@ -2828,6 +2832,22 @@ void AMDGPUAsmParser::cvtExp(MCInst &Inst, const OperandVector &Operands) {
OptionalIdx[Op.getImmTy()] = i;
}
+ assert(SrcIdx == 4);
+
+ bool Compr = false;
+ if (OptionalIdx.find(AMDGPUOperand::ImmTyExpCompr) != OptionalIdx.end()) {
+ Compr = true;
+ Inst.getOperand(OperandIdx[1]) = Inst.getOperand(OperandIdx[2]);
+ Inst.getOperand(OperandIdx[2]).setReg(AMDGPU::NoRegister);
+ Inst.getOperand(OperandIdx[3]).setReg(AMDGPU::NoRegister);
+ }
+
+ for (auto i = 0; i < SrcIdx; ++i) {
+ if (Inst.getOperand(OperandIdx[i]).getReg() != AMDGPU::NoRegister) {
+ EnMask |= Compr? (0x3 << i * 2) : (0x1 << i);
+ }
+ }
+
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyExpVM);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyExpCompr);
@@ -3642,6 +3662,7 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
{"src0_sel", AMDGPUOperand::ImmTySdwaSrc0Sel, false, nullptr},
{"src1_sel", AMDGPUOperand::ImmTySdwaSrc1Sel, false, nullptr},
{"dst_unused", AMDGPUOperand::ImmTySdwaDstUnused, false, nullptr},
+ {"compr", AMDGPUOperand::ImmTyExpCompr, true, nullptr },
{"vm", AMDGPUOperand::ImmTyExpVM, true, nullptr},
{"op_sel", AMDGPUOperand::ImmTyOpSel, false, nullptr},
{"op_sel_hi", AMDGPUOperand::ImmTyOpSelHi, false, nullptr},
diff --git a/lib/Target/AMDGPU/BUFInstructions.td b/lib/Target/AMDGPU/BUFInstructions.td
index 89eddb9ce961f..2aca65ac84303 100644
--- a/lib/Target/AMDGPU/BUFInstructions.td
+++ b/lib/Target/AMDGPU/BUFInstructions.td
@@ -11,8 +11,8 @@ def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">;
def MUBUFAddr64 : ComplexPattern<i64, 7, "SelectMUBUFAddr64">;
def MUBUFAddr64Atomic : ComplexPattern<i64, 5, "SelectMUBUFAddr64">;
-def MUBUFScratchOffen : ComplexPattern<i64, 4, "SelectMUBUFScratchOffen">;
-def MUBUFScratchOffset : ComplexPattern<i64, 3, "SelectMUBUFScratchOffset", [], [], 20>;
+def MUBUFScratchOffen : ComplexPattern<i64, 4, "SelectMUBUFScratchOffen", [], [SDNPWantRoot]>;
+def MUBUFScratchOffset : ComplexPattern<i64, 3, "SelectMUBUFScratchOffset", [], [SDNPWantRoot], 20>;
def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">;
def MUBUFOffsetNoGLC : ComplexPattern<i64, 3, "SelectMUBUFOffset">;
diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 4fb03b62bba9a..137b5cca96ce8 100644
--- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -126,6 +126,7 @@ DecodeStatus AMDGPUDisassembler::tryDecodeInst(const uint8_t* Table,
assert(MI.getOpcode() == 0);
assert(MI.getNumOperands() == 0);
MCInst TmpInst;
+ HasLiteral = false;
const auto SavedBytes = Bytes;
if (decodeInstruction(Table, TmpInst, Inst, Address, this, STI)) {
MI = TmpInst;
@@ -343,10 +344,15 @@ MCOperand AMDGPUDisassembler::decodeLiteralConstant() const {
// For now all literal constants are supposed to be unsigned integer
// ToDo: deal with signed/unsigned 64-bit integer constants
// ToDo: deal with float/double constants
- if (Bytes.size() < 4)
- return errOperand(0, "cannot read literal, inst bytes left " +
- Twine(Bytes.size()));
- return MCOperand::createImm(eatBytes<uint32_t>(Bytes));
+ if (!HasLiteral) {
+ if (Bytes.size() < 4) {
+ return errOperand(0, "cannot read literal, inst bytes left " +
+ Twine(Bytes.size()));
+ }
+ HasLiteral = true;
+ Literal = eatBytes<uint32_t>(Bytes);
+ }
+ return MCOperand::createImm(Literal);
}
MCOperand AMDGPUDisassembler::decodeIntImmed(unsigned Imm) {
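
The HasLiteral/Literal cache added above makes decodeLiteralConstant() idempotent: the first operand that needs the trailing 32-bit literal consumes it from the byte stream, and any later operand of the same instruction reuses the cached value instead of reading past the end. A simplified standalone model of that memoization, with hypothetical names and an explicit little-endian read:

#include <cstddef>
#include <cstdint>
#include <vector>

struct LiteralCache {
  const std::vector<uint8_t> *Bytes = nullptr;
  size_t Pos = 0;
  bool HasLiteral = false; // reset before decoding each instruction
  uint32_t Literal = 0;

  // Returns the instruction's 32-bit literal, consuming input only once.
  uint32_t decodeLiteral() {
    if (!HasLiteral) {
      // The real decoder reports an error if fewer than 4 bytes remain.
      Literal = uint32_t((*Bytes)[Pos]) | (uint32_t((*Bytes)[Pos + 1]) << 8) |
                (uint32_t((*Bytes)[Pos + 2]) << 16) |
                (uint32_t((*Bytes)[Pos + 3]) << 24);
      Pos += 4;
      HasLiteral = true;
    }
    return Literal;
  }
};

int main() {
  std::vector<uint8_t> Stream = {0x78, 0x56, 0x34, 0x12};
  LiteralCache C;
  C.Bytes = &Stream;
  // Second call reuses the cache rather than consuming more bytes.
  return (C.decodeLiteral() == 0x12345678 &&
          C.decodeLiteral() == 0x12345678) ? 0 : 1;
}
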
diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index d50665187e10b..620bae0a6d1a9 100644
--- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -39,6 +39,8 @@ class Twine;
class AMDGPUDisassembler : public MCDisassembler {
private:
mutable ArrayRef<uint8_t> Bytes;
+ mutable uint32_t Literal;
+ mutable bool HasLiteral;
public:
AMDGPUDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) :
diff --git a/lib/Target/AMDGPU/GCNRegPressure.cpp b/lib/Target/AMDGPU/GCNRegPressure.cpp
index 8066428fe44aa..18374dca3f840 100644
--- a/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "GCNRegPressure.h"
+#include "llvm/CodeGen/RegisterPressure.h"
using namespace llvm;
@@ -63,15 +64,6 @@ static bool isEqual(const GCNRPTracker::LiveRegSet &S1,
return true;
}
-static GCNRPTracker::LiveRegSet
-stripEmpty(const GCNRPTracker::LiveRegSet &LR) {
- GCNRPTracker::LiveRegSet Res;
- for (const auto &P : LR) {
- if (P.second.any())
- Res.insert(P);
- }
- return Res;
-}
#endif
///////////////////////////////////////////////////////////////////////////////
@@ -185,6 +177,64 @@ void GCNRegPressure::print(raw_ostream &OS, const SISubtarget *ST) const {
}
#endif
+
+static LaneBitmask getDefRegMask(const MachineOperand &MO,
+ const MachineRegisterInfo &MRI) {
+ assert(MO.isDef() && MO.isReg() &&
+ TargetRegisterInfo::isVirtualRegister(MO.getReg()));
+
+ // We don't rely on read-undef flag because in case of tentative schedule
+ // tracking it isn't set correctly yet. This works correctly however since
+ // use mask has been tracked before using LIS.
+ return MO.getSubReg() == 0 ?
+ MRI.getMaxLaneMaskForVReg(MO.getReg()) :
+ MRI.getTargetRegisterInfo()->getSubRegIndexLaneMask(MO.getSubReg());
+}
+
+static LaneBitmask getUsedRegMask(const MachineOperand &MO,
+ const MachineRegisterInfo &MRI,
+ const LiveIntervals &LIS) {
+ assert(MO.isUse() && MO.isReg() &&
+ TargetRegisterInfo::isVirtualRegister(MO.getReg()));
+
+ if (auto SubReg = MO.getSubReg())
+ return MRI.getTargetRegisterInfo()->getSubRegIndexLaneMask(SubReg);
+
+ auto MaxMask = MRI.getMaxLaneMaskForVReg(MO.getReg());
+ if (MaxMask.getAsInteger() == 1) // cannot have subregs
+ return MaxMask;
+
+ // For a tentative schedule LIS isn't updated yet but livemask should remain
+ // the same on any schedule. Subreg defs can be reordered but they all must
+ // dominate uses anyway.
+ auto SI = LIS.getInstructionIndex(*MO.getParent()).getBaseIndex();
+ return getLiveLaneMask(MO.getReg(), SI, LIS, MRI);
+}
+
+SmallVector<RegisterMaskPair, 8> collectVirtualRegUses(const MachineInstr &MI,
+ const LiveIntervals &LIS,
+ const MachineRegisterInfo &MRI) {
+ SmallVector<RegisterMaskPair, 8> Res;
+ for (const auto &MO : MI.operands()) {
+ if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ continue;
+ if (!MO.isUse() || !MO.readsReg())
+ continue;
+
+ auto const UsedMask = getUsedRegMask(MO, MRI, LIS);
+
+ auto Reg = MO.getReg();
+ auto I = std::find_if(Res.begin(), Res.end(), [Reg](const RegisterMaskPair &RM) {
+ return RM.RegUnit == Reg;
+ });
+ if (I != Res.end())
+ I->LaneMask |= UsedMask;
+ else
+ Res.push_back(RegisterMaskPair(Reg, UsedMask));
+ }
+ return Res;
+}
+
///////////////////////////////////////////////////////////////////////////////
// GCNRPTracker
@@ -222,36 +272,6 @@ GCNRPTracker::LiveRegSet llvm::getLiveRegs(SlotIndex SI,
return LiveRegs;
}
-LaneBitmask GCNRPTracker::getDefRegMask(const MachineOperand &MO) const {
- assert(MO.isDef() && MO.isReg() &&
- TargetRegisterInfo::isVirtualRegister(MO.getReg()));
-
- // We don't rely on read-undef flag because in case of tentative schedule
- // tracking it isn't set correctly yet. This works correctly however since
- // use mask has been tracked before using LIS.
- return MO.getSubReg() == 0 ?
- MRI->getMaxLaneMaskForVReg(MO.getReg()) :
- MRI->getTargetRegisterInfo()->getSubRegIndexLaneMask(MO.getSubReg());
-}
-
-LaneBitmask GCNRPTracker::getUsedRegMask(const MachineOperand &MO) const {
- assert(MO.isUse() && MO.isReg() &&
- TargetRegisterInfo::isVirtualRegister(MO.getReg()));
-
- if (auto SubReg = MO.getSubReg())
- return MRI->getTargetRegisterInfo()->getSubRegIndexLaneMask(SubReg);
-
- auto MaxMask = MRI->getMaxLaneMaskForVReg(MO.getReg());
- if (MaxMask.getAsInteger() == 1) // cannot have subregs
- return MaxMask;
-
- // For a tentative schedule LIS isn't updated yet but livemask should remain
- // the same on any schedule. Subreg defs can be reordered but they all must
- // dominate uses anyway.
- auto SI = LIS.getInstructionIndex(*MO.getParent()).getBaseIndex();
- return getLiveLaneMask(MO.getReg(), SI, LIS, *MRI);
-}
-
void GCNUpwardRPTracker::reset(const MachineInstr &MI,
const LiveRegSet *LiveRegsCopy) {
MRI = &MI.getParent()->getParent()->getRegInfo();
@@ -272,34 +292,40 @@ void GCNUpwardRPTracker::recede(const MachineInstr &MI) {
if (MI.isDebugValue())
return;
- // process all defs first to ensure early clobbers are handled correctly
- // iterating over operands() to catch implicit defs
- for (const auto &MO : MI.operands()) {
- if (!MO.isReg() || !MO.isDef() ||
- !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
- continue;
+ auto const RegUses = collectVirtualRegUses(MI, LIS, *MRI);
- auto Reg = MO.getReg();
- auto &LiveMask = LiveRegs[Reg];
- auto PrevMask = LiveMask;
- LiveMask &= ~getDefRegMask(MO);
- CurPressure.inc(Reg, PrevMask, LiveMask, *MRI);
+ // calc pressure at the MI (defs + uses)
+ auto AtMIPressure = CurPressure;
+ for (const auto &U : RegUses) {
+ auto LiveMask = LiveRegs[U.RegUnit];
+ AtMIPressure.inc(U.RegUnit, LiveMask, LiveMask | U.LaneMask, *MRI);
}
+ // update max pressure
+ MaxPressure = max(AtMIPressure, MaxPressure);
- // then all uses
- for (const auto &MO : MI.uses()) {
- if (!MO.isReg() || !MO.readsReg() ||
- !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ for (const auto &MO : MI.defs()) {
+ if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()) ||
+ MO.isDead())
continue;
auto Reg = MO.getReg();
- auto &LiveMask = LiveRegs[Reg];
+ auto I = LiveRegs.find(Reg);
+ if (I == LiveRegs.end())
+ continue;
+ auto &LiveMask = I->second;
auto PrevMask = LiveMask;
- LiveMask |= getUsedRegMask(MO);
+ LiveMask &= ~getDefRegMask(MO, *MRI);
CurPressure.inc(Reg, PrevMask, LiveMask, *MRI);
+ if (LiveMask.none())
+ LiveRegs.erase(I);
}
-
- MaxPressure = max(MaxPressure, CurPressure);
+ for (const auto &U : RegUses) {
+ auto &LiveMask = LiveRegs[U.RegUnit];
+ auto PrevMask = LiveMask;
+ LiveMask |= U.LaneMask;
+ CurPressure.inc(U.RegUnit, PrevMask, LiveMask, *MRI);
+ }
+ assert(CurPressure == getRegPressure(*MRI, LiveRegs));
}
bool GCNDownwardRPTracker::reset(const MachineInstr &MI,
@@ -368,7 +394,7 @@ void GCNDownwardRPTracker::advanceToNext() {
continue;
auto &LiveMask = LiveRegs[Reg];
auto PrevMask = LiveMask;
- LiveMask |= getDefRegMask(MO);
+ LiveMask |= getDefRegMask(MO, *MRI);
CurPressure.inc(Reg, PrevMask, LiveMask, *MRI);
}
@@ -430,7 +456,7 @@ static void reportMismatch(const GCNRPTracker::LiveRegSet &LISLR,
bool GCNUpwardRPTracker::isValid() const {
const auto &SI = LIS.getInstructionIndex(*LastTrackedMI).getBaseIndex();
const auto LISLR = llvm::getLiveRegs(SI, LIS, *MRI);
- const auto TrackedLR = stripEmpty(LiveRegs);
+ const auto &TrackedLR = LiveRegs;
if (!isEqual(LISLR, TrackedLR)) {
dbgs() << "\nGCNUpwardRPTracker error: Tracked and"
diff --git a/lib/Target/AMDGPU/GCNRegPressure.h b/lib/Target/AMDGPU/GCNRegPressure.h
index 9875ca6a6d161..5dfe44053e728 100644
--- a/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/lib/Target/AMDGPU/GCNRegPressure.h
@@ -98,8 +98,6 @@ protected:
const MachineInstr *LastTrackedMI = nullptr;
mutable const MachineRegisterInfo *MRI = nullptr;
GCNRPTracker(const LiveIntervals &LIS_) : LIS(LIS_) {}
- LaneBitmask getDefRegMask(const MachineOperand &MO) const;
- LaneBitmask getUsedRegMask(const MachineOperand &MO) const;
public:
// live regs for the current state
const decltype(LiveRegs) &getLiveRegs() const { return LiveRegs; }
diff --git a/lib/Target/AMDGPU/R600ClauseMergePass.cpp b/lib/Target/AMDGPU/R600ClauseMergePass.cpp
index d0aba38f786d3..fbe45cb222d93 100644
--- a/lib/Target/AMDGPU/R600ClauseMergePass.cpp
+++ b/lib/Target/AMDGPU/R600ClauseMergePass.cpp
@@ -62,7 +62,7 @@ private:
const MachineInstr &LatrCFAlu) const;
public:
- R600ClauseMergePass(TargetMachine &tm) : MachineFunctionPass(ID) { }
+ R600ClauseMergePass() : MachineFunctionPass(ID) { }
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -208,6 +208,6 @@ StringRef R600ClauseMergePass::getPassName() const {
} // end anonymous namespace
-llvm::FunctionPass *llvm::createR600ClauseMergePass(TargetMachine &TM) {
- return new R600ClauseMergePass(TM);
+llvm::FunctionPass *llvm::createR600ClauseMergePass() {
+ return new R600ClauseMergePass();
}
diff --git a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
index 811b905588b4b..09b3287656047 100644
--- a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
@@ -499,7 +499,7 @@ private:
}
public:
- R600ControlFlowFinalizer(TargetMachine &tm) : MachineFunctionPass(ID) {}
+ R600ControlFlowFinalizer() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override {
ST = &MF.getSubtarget<R600Subtarget>();
@@ -706,6 +706,6 @@ char R600ControlFlowFinalizer::ID = 0;
} // end anonymous namespace
-FunctionPass *llvm::createR600ControlFlowFinalizer(TargetMachine &TM) {
- return new R600ControlFlowFinalizer(TM);
+FunctionPass *llvm::createR600ControlFlowFinalizer() {
+ return new R600ControlFlowFinalizer();
}
diff --git a/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
index 3e46e6387614e..5c30a0734f0d8 100644
--- a/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
+++ b/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
@@ -37,7 +37,7 @@ private:
unsigned Op);
public:
- R600ExpandSpecialInstrsPass(TargetMachine &tm) : MachineFunctionPass(ID),
+ R600ExpandSpecialInstrsPass() : MachineFunctionPass(ID),
TII(nullptr) { }
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -51,8 +51,8 @@ public:
char R600ExpandSpecialInstrsPass::ID = 0;
-FunctionPass *llvm::createR600ExpandSpecialInstrsPass(TargetMachine &TM) {
- return new R600ExpandSpecialInstrsPass(TM);
+FunctionPass *llvm::createR600ExpandSpecialInstrsPass() {
+ return new R600ExpandSpecialInstrsPass();
}
void R600ExpandSpecialInstrsPass::SetFlagInNewMI(MachineInstr *NewMI,
diff --git a/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
index d90008a550aeb..502dd3bce97e1 100644
--- a/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
+++ b/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
@@ -124,7 +124,7 @@ private:
public:
static char ID;
- R600VectorRegMerger(TargetMachine &tm) : MachineFunctionPass(ID),
+ R600VectorRegMerger() : MachineFunctionPass(ID),
TII(nullptr) { }
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -396,6 +396,6 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) {
return false;
}
-llvm::FunctionPass *llvm::createR600VectorRegMerger(TargetMachine &tm) {
- return new R600VectorRegMerger(tm);
+llvm::FunctionPass *llvm::createR600VectorRegMerger() {
+ return new R600VectorRegMerger();
}
diff --git a/lib/Target/AMDGPU/R600Packetizer.cpp b/lib/Target/AMDGPU/R600Packetizer.cpp
index 5b6dd1ed128dc..3e957126b4975 100644
--- a/lib/Target/AMDGPU/R600Packetizer.cpp
+++ b/lib/Target/AMDGPU/R600Packetizer.cpp
@@ -36,7 +36,7 @@ class R600Packetizer : public MachineFunctionPass {
public:
static char ID;
- R600Packetizer(const TargetMachine &TM) : MachineFunctionPass(ID) {}
+ R600Packetizer() : MachineFunctionPass(ID) {}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
@@ -404,6 +404,6 @@ bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) {
} // end anonymous namespace
-llvm::FunctionPass *llvm::createR600Packetizer(TargetMachine &tm) {
- return new R600Packetizer(tm);
+llvm::FunctionPass *llvm::createR600Packetizer() {
+ return new R600Packetizer();
}
diff --git a/lib/Target/AMDGPU/R600RegisterInfo.cpp b/lib/Target/AMDGPU/R600RegisterInfo.cpp
index dfdc602b80cdf..7501facb0cba1 100644
--- a/lib/Target/AMDGPU/R600RegisterInfo.cpp
+++ b/lib/Target/AMDGPU/R600RegisterInfo.cpp
@@ -56,6 +56,18 @@ BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
return Reserved;
}
+// Dummy to not crash RegisterClassInfo.
+static const MCPhysReg CalleeSavedReg = AMDGPU::NoRegister;
+
+const MCPhysReg *R600RegisterInfo::getCalleeSavedRegs(
+ const MachineFunction *) const {
+ return &CalleeSavedReg;
+}
+
+unsigned R600RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+ return AMDGPU::NoRegister;
+}
+
unsigned R600RegisterInfo::getHWRegChan(unsigned reg) const {
return this->getEncodingValue(reg) >> HW_CHAN_SHIFT;
}
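LLVM expects getCalleeSavedRegs to return a list terminated by a zero register, so returning a pointer to a single NoRegister entry gives RegisterClassInfo an empty-but-valid list to walk. A minimal standalone sketch of that convention, with plain unsigned standing in for MCPhysReg:

#include <cstdio>

typedef unsigned MCPhysReg;                 // stand-in for llvm::MCPhysReg
static const MCPhysReg NoRegister = 0;      // register 0 means "no register"
static const MCPhysReg CalleeSavedReg = NoRegister;

// Walk a zero-terminated callee-saved list, as RegisterClassInfo does.
static unsigned countCalleeSaved(const MCPhysReg *CSRegs) {
  unsigned N = 0;
  while (CSRegs[N] != NoRegister)
    ++N;
  return N;
}

int main() {
  // The dummy list is empty but safely terminated.
  std::printf("callee-saved regs: %u\n", countCalleeSaved(&CalleeSavedReg));
  return 0;
}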
diff --git a/lib/Target/AMDGPU/R600RegisterInfo.h b/lib/Target/AMDGPU/R600RegisterInfo.h
index 9dfb3106c6ccb..f0d9644b02f20 100644
--- a/lib/Target/AMDGPU/R600RegisterInfo.h
+++ b/lib/Target/AMDGPU/R600RegisterInfo.h
@@ -27,6 +27,8 @@ struct R600RegisterInfo final : public AMDGPURegisterInfo {
R600RegisterInfo();
BitVector getReservedRegs(const MachineFunction &MF) const override;
+ const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
+ unsigned getFrameRegister(const MachineFunction &MF) const override;
/// \brief get the HW encoding for a register's channel.
unsigned getHWRegChan(unsigned reg) const;
diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp
index 1279f845de0e3..97bb0f0c06565 100644
--- a/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -189,8 +189,6 @@ SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
// ----
// 13 (+1)
unsigned ReservedRegCount = 13;
- if (SPReg != AMDGPU::NoRegister)
- ++ReservedRegCount;
if (AllSGPRs.size() < ReservedRegCount)
return std::make_pair(ScratchWaveOffsetReg, SPReg);
@@ -208,13 +206,6 @@ SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
MFI->setScratchWaveOffsetReg(Reg);
ScratchWaveOffsetReg = Reg;
- } else {
- if (SPReg == AMDGPU::NoRegister)
- break;
-
- MRI.replaceRegWith(SPReg, Reg);
- MFI->setStackPtrOffsetReg(Reg);
- SPReg = Reg;
break;
}
}
@@ -223,8 +214,8 @@ SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
return std::make_pair(ScratchWaveOffsetReg, SPReg);
}
-void SIFrameLowering::emitPrologue(MachineFunction &MF,
- MachineBasicBlock &MBB) const {
+void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
// Emit debugger prologue if "amdgpu-debugger-emit-prologue" attribute was
// specified.
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
@@ -424,6 +415,13 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
}
}
+void SIFrameLowering::emitPrologue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ if (MFI->isEntryFunction())
+ emitEntryFunctionPrologue(MF, MBB);
+}
+
void SIFrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
diff --git a/lib/Target/AMDGPU/SIFrameLowering.h b/lib/Target/AMDGPU/SIFrameLowering.h
index 7ccd02b3c86a7..e17adbe273614 100644
--- a/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/lib/Target/AMDGPU/SIFrameLowering.h
@@ -26,6 +26,8 @@ public:
AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {}
~SIFrameLowering() override = default;
+ void emitEntryFunctionPrologue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const;
void emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const override;
void emitEpilogue(MachineFunction &MF,
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 286be355bc14f..01c1f78e7ca40 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -914,6 +914,55 @@ SDValue SITargetLowering::lowerKernargMemParameter(
return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
}
+SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
+ const SDLoc &SL, SDValue Chain,
+ const ISD::InputArg &Arg) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ if (Arg.Flags.isByVal()) {
+ unsigned Size = Arg.Flags.getByValSize();
+ int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
+ return DAG.getFrameIndex(FrameIdx, MVT::i32);
+ }
+
+ unsigned ArgOffset = VA.getLocMemOffset();
+ unsigned ArgSize = VA.getValVT().getStoreSize();
+
+ int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
+
+ // Create load nodes to retrieve arguments from the stack.
+ SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
+ SDValue ArgValue;
+
+ // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
+ ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
+ MVT MemVT = VA.getValVT();
+
+ switch (VA.getLocInfo()) {
+ default:
+ break;
+ case CCValAssign::BCvt:
+ MemVT = VA.getLocVT();
+ break;
+ case CCValAssign::SExt:
+ ExtType = ISD::SEXTLOAD;
+ break;
+ case CCValAssign::ZExt:
+ ExtType = ISD::ZEXTLOAD;
+ break;
+ case CCValAssign::AExt:
+ ExtType = ISD::EXTLOAD;
+ break;
+ }
+
+ ArgValue = DAG.getExtLoad(
+ ExtType, SL, VA.getLocVT(), Chain, FIN,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
+ MemVT);
+ return ArgValue;
+}
+
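The switch above picks how a narrowed argument is widened when reloaded from its fixed stack slot; BCvt additionally loads with the location type instead of the value type. A standalone sketch of the LocInfo-to-extension mapping, with local enums standing in for CCValAssign::LocInfo and ISD::LoadExtType:

#include <cassert>

enum class LocInfo { Full, BCvt, SExt, ZExt, AExt };            // CCValAssign
enum class LoadExtType { NonExt, SExtLoad, ZExtLoad, ExtLoad }; // ISD

static LoadExtType extTypeFor(LocInfo LI) {
  switch (LI) {
  case LocInfo::SExt: return LoadExtType::SExtLoad; // sign-extend on load
  case LocInfo::ZExt: return LoadExtType::ZExtLoad; // zero-extend on load
  case LocInfo::AExt: return LoadExtType::ExtLoad;  // any-extend, high bits undef
  default:            return LoadExtType::NonExt;   // Full/BCvt: plain load
  }
}

int main() {
  assert(extTypeFor(LocInfo::ZExt) == LoadExtType::ZExtLoad);
  assert(extTypeFor(LocInfo::BCvt) == LoadExtType::NonExt);
  return 0;
}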
static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
CallingConv::ID CallConv,
ArrayRef<ISD::InputArg> Ins,
@@ -1094,10 +1143,12 @@ static void allocateSystemSGPRs(CCState &CCInfo,
static void reservePrivateMemoryRegs(const TargetMachine &TM,
MachineFunction &MF,
const SIRegisterInfo &TRI,
- SIMachineFunctionInfo &Info) {
+ SIMachineFunctionInfo &Info,
+ bool NeedSP) {
// Now that we've figured out where the scratch register inputs are, see if we
// should reserve the arguments and use them directly.
- bool HasStackObjects = MF.getFrameInfo().hasStackObjects();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ bool HasStackObjects = MFI.hasStackObjects();
// Record that we know we have non-spill stack objects so we don't need to
// check all stack objects later.
@@ -1155,6 +1206,15 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
Info.setScratchWaveOffsetReg(ReservedOffsetReg);
}
}
+
+ if (NeedSP) {
+ unsigned ReservedStackPtrOffsetReg = TRI.reservedStackPtrOffsetReg(MF);
+ Info.setStackPtrOffsetReg(ReservedStackPtrOffsetReg);
+
+ assert(Info.getStackPtrOffsetReg() != Info.getFrameOffsetReg());
+ assert(!TRI.isSubRegister(Info.getScratchRSrcReg(),
+ Info.getStackPtrOffsetReg()));
+ }
}
SDValue SITargetLowering::LowerFormalArguments(
@@ -1223,8 +1283,10 @@ SDValue SITargetLowering::LowerFormalArguments(
!Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
!Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
!Info->hasWorkItemIDZ());
+ } else if (IsKernel) {
+ assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
} else {
- assert(!IsKernel || (Info->hasWorkGroupIDX() && Info->hasWorkItemIDX()));
+ Splits.append(Ins.begin(), Ins.end());
}
if (IsEntryFunc) {
@@ -1278,11 +1340,14 @@ SDValue SITargetLowering::LowerFormalArguments(
InVals.push_back(Arg);
continue;
+ } else if (!IsEntryFunc && VA.isMemLoc()) {
+ SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
+ InVals.push_back(Val);
+ if (!Arg.Flags.isByVal())
+ Chains.push_back(Val.getValue(1));
+ continue;
}
- if (VA.isMemLoc())
- report_fatal_error("memloc not supported with calling convention");
-
assert(VA.isRegLoc() && "Parameter must be in a register!");
unsigned Reg = VA.getLocReg();
@@ -1291,7 +1356,7 @@ SDValue SITargetLowering::LowerFormalArguments(
Reg = MF.addLiveIn(Reg, RC);
SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
- if (Arg.VT.isVector()) {
+ if (IsShader && Arg.VT.isVector()) {
// Build a vector from the registers
Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
unsigned NumElements = ParamType->getVectorNumElements();
@@ -1317,16 +1382,49 @@ SDValue SITargetLowering::LowerFormalArguments(
InVals.push_back(Val);
}
+ const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+
+ // TODO: Could maybe omit SP if only tail calls?
+ bool NeedSP = FrameInfo.hasCalls() || FrameInfo.hasVarSizedObjects();
+
// Start adding system SGPRs.
- if (IsEntryFunc)
+ if (IsEntryFunc) {
allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
-
- reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
+ reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info, NeedSP);
+ } else {
+ CCInfo.AllocateReg(Info->getScratchRSrcReg());
+ CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
+ CCInfo.AllocateReg(Info->getFrameOffsetReg());
+
+ if (NeedSP) {
+ unsigned StackPtrReg = findFirstFreeSGPR(CCInfo);
+ CCInfo.AllocateReg(StackPtrReg);
+ Info->setStackPtrOffsetReg(StackPtrReg);
+ }
+ }
return Chains.empty() ? Chain :
DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
}
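A stack pointer is only reserved when the function can actually use one: calls need a call frame, and dynamically sized allocas need a moving stack top. A standalone restatement of the NeedSP test above, with a small stand-in for the MachineFrameInfo queries:

#include <cassert>

struct FrameInfoStub {            // stand-in for MachineFrameInfo queries
  bool HasCalls;
  bool HasVarSizedObjects;
};

static bool needsStackPointer(const FrameInfoStub &FI) {
  // Calls need a call frame; dynamic allocas need a moving stack top.
  // Everything else is addressed at fixed offsets from the frame register.
  return FI.HasCalls || FI.HasVarSizedObjects;
}

int main() {
  assert(needsStackPointer({true, false}));
  assert(needsStackPointer({false, true}));
  assert(!needsStackPointer({false, false}));
  return 0;
}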
+// TODO: If return values can't fit in registers, we should return as many as
+// possible in registers before passing the rest on the stack.
+bool SITargetLowering::CanLowerReturn(
+ CallingConv::ID CallConv,
+ MachineFunction &MF, bool IsVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const {
+ // Replacing returns with sret/stack usage doesn't make sense for shaders.
+ // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
+ // for shaders. Vector types should be explicitly handled by CC.
+ if (AMDGPU::isEntryFunctionCC(CallConv))
+ return true;
+
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
+ return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
+}
+
SDValue
SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
@@ -1336,11 +1434,15 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
MachineFunction &MF = DAG.getMachineFunction();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- if (!AMDGPU::isShader(CallConv))
+ if (AMDGPU::isKernel(CallConv)) {
return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
OutVals, DL, DAG);
+ }
+
+ bool IsShader = AMDGPU::isShader(CallConv);
Info->setIfReturnsVoid(Outs.size() == 0);
+ bool IsWaveEnd = Info->returnsVoid() && IsShader;
SmallVector<ISD::OutputArg, 48> Splits;
SmallVector<SDValue, 48> SplitVals;
@@ -1349,7 +1451,7 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
const ISD::OutputArg &Out = Outs[i];
- if (Out.VT.isVector()) {
+ if (IsShader && Out.VT.isVector()) {
MVT VT = Out.VT.getVectorElementType();
ISD::OutputArg NewOut = Out;
NewOut.Flags.setSplit();
@@ -1380,29 +1482,58 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
*DAG.getContext());
// Analyze outgoing return values.
- AnalyzeReturn(CCInfo, Splits);
+ CCInfo.AnalyzeReturn(Splits, CCAssignFnForReturn(CallConv, isVarArg));
SDValue Flag;
SmallVector<SDValue, 48> RetOps;
RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
+ // Add return address for callable functions.
+ if (!Info->isEntryFunction()) {
+ const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
+ SDValue ReturnAddrReg = CreateLiveInRegister(
+ DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
+
+ // FIXME: Should be able to use a vreg here, but need a way to prevent it
+ // from being allocated to a CSR.
+
+ SDValue PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
+ MVT::i64);
+
+ Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, Flag);
+ Flag = Chain.getValue(1);
+
+ RetOps.push_back(PhysReturnAddrReg);
+ }
+
// Copy the result values into the output registers.
for (unsigned i = 0, realRVLocIdx = 0;
i != RVLocs.size();
++i, ++realRVLocIdx) {
CCValAssign &VA = RVLocs[i];
assert(VA.isRegLoc() && "Can only return in registers!");
+ // TODO: Partially return in registers if return values don't fit.
SDValue Arg = SplitVals[realRVLocIdx];
// Copied from other backends.
switch (VA.getLocInfo()) {
- default: llvm_unreachable("Unknown loc info!");
case CCValAssign::Full:
break;
case CCValAssign::BCvt:
Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::AExt:
+ Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ default:
+ llvm_unreachable("Unknown loc info!");
}
Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
@@ -1410,12 +1541,16 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
}
+ // FIXME: Does sret work properly?
+
// Update chain and glue.
RetOps[0] = Chain;
if (Flag.getNode())
RetOps.push_back(Flag);
- unsigned Opc = Info->returnsVoid() ? AMDGPUISD::ENDPGM : AMDGPUISD::RETURN_TO_EPILOG;
+ unsigned Opc = AMDGPUISD::ENDPGM;
+ if (!IsWaveEnd)
+ Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
return DAG.getNode(Opc, DL, MVT::Other, RetOps);
}
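With callable functions supported, the final terminator depends on both the calling convention and whether anything is returned: a shader returning nothing ends the wave, a shader with results returns to the epilog, and any other callable function emits a normal return. A standalone restatement of that selection, with a local enum standing in for the AMDGPUISD opcodes:

#include <cassert>

enum class RetOpc { EndPgm, ReturnToEpilog, RetFlag }; // AMDGPUISD stand-ins

static RetOpc selectReturnOpc(bool IsShader, bool ReturnsVoid) {
  if (IsShader && ReturnsVoid)
    return RetOpc::EndPgm;                  // shader, no results: end the wave
  return IsShader ? RetOpc::ReturnToEpilog  // shader results live to epilog
                  : RetOpc::RetFlag;        // callable function: real return
}

int main() {
  assert(selectReturnOpc(true, true) == RetOpc::EndPgm);
  assert(selectReturnOpc(true, false) == RetOpc::ReturnToEpilog);
  assert(selectReturnOpc(false, true) == RetOpc::RetFlag);
  return 0;
}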
@@ -2660,6 +2795,15 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
SDValue Vec = Op.getOperand(0);
SDValue Idx = Op.getOperand(1);
+ DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
+
+ // Make sure we do any optimizations that will make it easier to fold
+ // source modifiers before obscuring it with bit operations.
+
+ // XXX - Why doesn't this get called when vector_shuffle is expanded?
+ if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
+ return Combined;
+
if (const ConstantSDNode *CIdx = dyn_cast<ConstantSDNode>(Idx)) {
SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index 046e677756d12..e68837747491d 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -28,6 +28,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
uint64_t Offset, bool Signed,
const ISD::InputArg *Arg = nullptr) const;
+ SDValue lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
+ const SDLoc &SL, SDValue Chain,
+ const ISD::InputArg &Arg) const;
+
SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
SelectionDAG &DAG) const override;
SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
@@ -177,7 +181,12 @@ public:
const SDLoc &DL, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const override;
- SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ bool CanLowerReturn(CallingConv::ID CallConv,
+ MachineFunction &MF, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const override;
+
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
SelectionDAG &DAG) const override;
diff --git a/lib/Target/AMDGPU/SIInstrFormats.td b/lib/Target/AMDGPU/SIInstrFormats.td
index b83a1fe187eb7..02c9b4b1f0eeb 100644
--- a/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/lib/Target/AMDGPU/SIInstrFormats.td
@@ -228,10 +228,10 @@ class EXPe : Enc64 {
bits<1> compr;
bits<1> done;
bits<1> vm;
- bits<8> vsrc0;
- bits<8> vsrc1;
- bits<8> vsrc2;
- bits<8> vsrc3;
+ bits<8> src0;
+ bits<8> src1;
+ bits<8> src2;
+ bits<8> src3;
let Inst{3-0} = en;
let Inst{9-4} = tgt;
@@ -239,10 +239,10 @@ class EXPe : Enc64 {
let Inst{11} = done;
let Inst{12} = vm;
let Inst{31-26} = 0x3e;
- let Inst{39-32} = vsrc0;
- let Inst{47-40} = vsrc1;
- let Inst{55-48} = vsrc2;
- let Inst{63-56} = vsrc3;
+ let Inst{39-32} = src0;
+ let Inst{47-40} = src1;
+ let Inst{55-48} = src2;
+ let Inst{63-56} = src3;
}
let Uses = [EXEC] in {
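The rename above changes only the field names; the bit layout is unchanged. As a cross-check, a hypothetical standalone encoder packing the fields per the let-assignments shown (compr is assumed to sit in bit 10, which falls outside the displayed hunk):

#include <cstdint>
#include <cstdio>

static uint64_t encodeEXP(unsigned En, unsigned Tgt, bool Compr, bool Done,
                          bool VM, uint8_t Src0, uint8_t Src1, uint8_t Src2,
                          uint8_t Src3) {
  uint64_t Inst = 0;
  Inst |= uint64_t(En & 0xf);          // Inst{3-0}   = en
  Inst |= uint64_t(Tgt & 0x3f) << 4;   // Inst{9-4}   = tgt
  Inst |= uint64_t(Compr) << 10;       // Inst{10}    = compr (assumed)
  Inst |= uint64_t(Done) << 11;        // Inst{11}    = done
  Inst |= uint64_t(VM) << 12;          // Inst{12}    = vm
  Inst |= uint64_t(0x3e) << 26;        // Inst{31-26} = fixed opcode bits
  Inst |= uint64_t(Src0) << 32;        // Inst{39-32} = src0
  Inst |= uint64_t(Src1) << 40;        // Inst{47-40} = src1
  Inst |= uint64_t(Src2) << 48;        // Inst{55-48} = src2
  Inst |= uint64_t(Src3) << 56;        // Inst{63-56} = src3
  return Inst;
}

int main() {
  std::printf("%016llx\n",
              (unsigned long long)encodeEXP(0xf, 0, false, true, false,
                                            0, 1, 2, 3));
  return 0;
}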
diff --git a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 933a16646746d..c6ad61a325ccd 100644
--- a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -97,9 +97,7 @@ private:
public:
static char ID;
- SILoadStoreOptimizer() : MachineFunctionPass(ID) {}
-
- SILoadStoreOptimizer(const TargetMachine &TM_) : MachineFunctionPass(ID) {
+ SILoadStoreOptimizer() : MachineFunctionPass(ID) {
initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
}
@@ -129,8 +127,8 @@ char SILoadStoreOptimizer::ID = 0;
char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
-FunctionPass *llvm::createSILoadStoreOptimizerPass(TargetMachine &TM) {
- return new SILoadStoreOptimizer(TM);
+FunctionPass *llvm::createSILoadStoreOptimizerPass() {
+ return new SILoadStoreOptimizer();
}
static void moveInstsAfter(MachineBasicBlock::iterator I,
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index adebb8c4a1c5b..18b197ddb7ae7 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -80,17 +80,22 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(*F);
WavesPerEU = ST.getWavesPerEU(*F);
- // Non-entry functions have no special inputs for now.
- // TODO: Return early for non-entry CCs.
+ if (!isEntryFunction()) {
+ // Non-entry functions have no special inputs for now, other than the
+ // registers required for scratch access.
+ ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
+ ScratchWaveOffsetReg = AMDGPU::SGPR4;
+ FrameOffsetReg = AMDGPU::SGPR5;
+ return;
+ }
CallingConv::ID CC = F->getCallingConv();
- if (CC == CallingConv::AMDGPU_PS)
- PSInputAddr = AMDGPU::getInitialPSInputAddr(*F);
-
- if (AMDGPU::isKernel(CC)) {
+ if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) {
KernargSegmentPtr = true;
WorkGroupIDX = true;
WorkItemIDX = true;
+ } else if (CC == CallingConv::AMDGPU_PS) {
+ PSInputAddr = AMDGPU::getInitialPSInputAddr(*F);
}
if (ST.debuggerEmitPrologue()) {
@@ -120,7 +125,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
bool MaySpill = ST.isVGPRSpillingEnabled(*F);
- bool HasStackObjects = FrameInfo.hasStackObjects();
+ bool HasStackObjects = FrameInfo.hasStackObjects() || FrameInfo.hasCalls();
if (HasStackObjects || MaySpill) {
PrivateSegmentWaveByteOffset = true;
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index dc9f509e60ae2..348bb4fa0260c 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -388,9 +388,8 @@ public:
void setScratchWaveOffsetReg(unsigned Reg) {
assert(Reg != AMDGPU::NoRegister && "Should never be unset");
ScratchWaveOffsetReg = Reg;
-
- // FIXME: Only for entry functions.
- FrameOffsetReg = ScratchWaveOffsetReg;
+ if (isEntryFunction())
+ FrameOffsetReg = ScratchWaveOffsetReg;
}
unsigned getQueuePtrUserSGPR() const {
diff --git a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index e02c2e3240e84..4dc090d9b7edc 100644
--- a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -30,6 +30,7 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include <unordered_map>
+#include <unordered_set>
using namespace llvm;
@@ -44,26 +45,29 @@ namespace {
class SDWAOperand;
class SIPeepholeSDWA : public MachineFunctionPass {
+public:
+ typedef SmallVector<SDWAOperand *, 4> SDWAOperandsVector;
+
private:
MachineRegisterInfo *MRI;
const SIRegisterInfo *TRI;
const SIInstrInfo *TII;
std::unordered_map<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
+ std::unordered_map<MachineInstr *, SDWAOperandsVector> PotentialMatches;
Optional<int64_t> foldToImm(const MachineOperand &Op) const;
public:
static char ID;
- typedef SmallVector<std::unique_ptr<SDWAOperand>, 4> SDWAOperandsVector;
-
SIPeepholeSDWA() : MachineFunctionPass(ID) {
initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry());
}
bool runOnMachineFunction(MachineFunction &MF) override;
void matchSDWAOperands(MachineFunction &MF);
+ bool isConvertibleToSDWA(const MachineInstr &MI) const;
bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
StringRef getPassName() const override { return "SI Peephole SDWA"; }
@@ -468,7 +472,7 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) {
if (Opcode == AMDGPU::V_LSHLREV_B16_e32) {
auto SDWADst =
- make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
+ make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n');
SDWAOperands[&MI] = std::move(SDWADst);
++NumSDWAPatternsFound;
@@ -575,8 +579,7 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) {
}
}
-bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
- const SDWAOperandsVector &SDWAOperands) {
+bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI) const {
// Check if this instruction can be converted to SDWA:
// 1. Does this opcode support SDWA
if (AMDGPU::getSDWAOp(MI.getOpcode()) == -1)
@@ -588,6 +591,11 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
return false;
}
+ return true;
+}
+
+bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
+ const SDWAOperandsVector &SDWAOperands) {
// Convert to sdwa
int SDWAOpcode = AMDGPU::getSDWAOp(MI.getOpcode());
assert(SDWAOpcode != -1);
@@ -664,7 +672,18 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
// Apply all sdwa operand patterns
bool Converted = false;
for (auto &Operand : SDWAOperands) {
- Converted |= Operand->convertToSDWA(*SDWAInst, TII);
+ // There should be no intersection between SDWA operands and potential MIs
+ // e.g.:
+ // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
+ // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
+ // v_add_u32 v3, v4, v2
+ //
+ // In that example it is possible that we would fold the 2nd instruction
+ // into the 3rd (v_add_u32_sdwa) and then try to fold the 1st instruction
+ // into the 2nd (which was already destroyed). So if an SDWAOperand's
+ // parent is also a potential MI, do not apply it.
+ if (PotentialMatches.count(Operand->getParentInst()) == 0)
+ Converted |= Operand->convertToSDWA(*SDWAInst, TII);
}
if (!Converted) {
SDWAInst->eraseFromParent();
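The new guard keeps one fold from targeting an instruction that is itself queued for SDWA conversion and therefore about to be erased. A standalone model of the check, with plain integer ids standing in for MachineInstr pointers:

#include <cassert>
#include <unordered_set>

typedef int InstrId; // plain ids standing in for MachineInstr pointers

static bool shouldApplyOperand(InstrId ParentMI,
                               const std::unordered_set<InstrId> &Potential) {
  // Skip operands whose parent is itself queued for SDWA conversion,
  // since converting it would destroy the instruction we would fold into.
  return Potential.count(ParentMI) == 0;
}

int main() {
  // #1 feeds #2, and #2 is itself being folded into #3:
  std::unordered_set<InstrId> Potential = {2};
  assert(!shouldApplyOperand(2, Potential)); // don't fold #1 into dying #2
  assert(shouldApplyOperand(1, Potential));  // folding #2 into #3 is fine
  return 0;
}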
@@ -690,16 +709,15 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
MRI = &MF.getRegInfo();
TRI = ST.getRegisterInfo();
TII = ST.getInstrInfo();
-
- std::unordered_map<MachineInstr *, SDWAOperandsVector> PotentialMatches;
-
+
+ // Find all SDWA operands in MF.
matchSDWAOperands(MF);
- for (auto &OperandPair : SDWAOperands) {
- auto &Operand = OperandPair.second;
+ for (const auto &OperandPair : SDWAOperands) {
+ const auto &Operand = OperandPair.second;
MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
- if (PotentialMI) {
- PotentialMatches[PotentialMI].push_back(std::move(Operand));
+ if (PotentialMI && isConvertibleToSDWA(*PotentialMI)) {
+ PotentialMatches[PotentialMI].push_back(Operand.get());
}
}
@@ -708,6 +726,7 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
convertToSDWA(PotentialMI, PotentialPair.second);
}
+ PotentialMatches.clear();
SDWAOperands.clear();
return false;
}
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 06cfc95be96a5..6fb01a09fe13c 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -117,11 +117,7 @@ unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
}
-unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
- const MachineFunction &MF) const {
-
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
- unsigned RegCount = ST.getMaxNumSGPRs(MF);
+static unsigned findPrivateSegmentWaveByteOffsetRegIndex(unsigned RegCount) {
unsigned Reg;
// Try to place it in a hole after PrivateSegmentBufferReg.
@@ -134,9 +130,22 @@ unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
// wave offset before it.
Reg = RegCount - 5;
}
+
+ return Reg;
+}
+
+unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
+ const MachineFunction &MF) const {
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ unsigned Reg = findPrivateSegmentWaveByteOffsetRegIndex(ST.getMaxNumSGPRs(MF));
return AMDGPU::SGPR_32RegClass.getRegister(Reg);
}
+unsigned SIRegisterInfo::reservedStackPtrOffsetReg(
+ const MachineFunction &MF) const {
+ return AMDGPU::SGPR32;
+}
+
BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
Reserved.set(AMDGPU::INDIRECT_BASE_ADDR);
@@ -198,15 +207,33 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg));
}
+ unsigned StackPtrReg = MFI->getStackPtrOffsetReg();
+ if (StackPtrReg != AMDGPU::NoRegister) {
+ reserveRegisterTuples(Reserved, StackPtrReg);
+ assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
+ }
+
+ unsigned FrameReg = MFI->getFrameOffsetReg();
+ if (FrameReg != AMDGPU::NoRegister) {
+ reserveRegisterTuples(Reserved, FrameReg);
+ assert(!isSubRegister(ScratchRSrcReg, FrameReg));
+ }
+
return Reserved;
}
bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
- return Fn.getFrameInfo().hasStackObjects();
+ const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>();
+ if (Info->isEntryFunction()) {
+ const MachineFrameInfo &MFI = Fn.getFrameInfo();
+ return MFI.hasStackObjects() || MFI.hasCalls();
+ }
+
+ // May need scavenger for dealing with callee saved registers.
+ return true;
}
-bool
-SIRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) const {
+bool SIRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) const {
return MF.getFrameInfo().hasStackObjects();
}
@@ -318,8 +345,11 @@ void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
MachineOperand *FIOp = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
assert(FIOp && FIOp->isFI() && "frame index must be address operand");
-
assert(TII->isMUBUF(MI));
+ assert(TII->getNamedOperand(MI, AMDGPU::OpName::soffset)->getReg() ==
+ MF->getInfo<SIMachineFunctionInfo>()->getFrameOffsetReg() &&
+ "should only be seeing frame offset relative FrameIndex");
+
MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
int64_t NewOffset = OffsetOp->getImm() + Offset;
@@ -981,12 +1011,72 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
}
default: {
- if (TII->isMUBUF(*MI)) {
+ const DebugLoc &DL = MI->getDebugLoc();
+ bool IsMUBUF = TII->isMUBUF(*MI);
+
+ if (!IsMUBUF &&
+ MFI->getFrameOffsetReg() != MFI->getScratchWaveOffsetReg()) {
+ // Convert to an absolute stack address by finding the offset from the
+ // scratch wave base and scaling by the wave size.
+ //
+ // In an entry function/kernel the stack address is already the absolute
+ // address relative to the scratch wave offset.
+
+ unsigned DiffReg
+ = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+ bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32;
+ unsigned ResultReg = IsCopy ?
+ MI->getOperand(0).getReg() :
+ MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), DiffReg)
+ .addReg(MFI->getFrameOffsetReg())
+ .addReg(MFI->getScratchWaveOffsetReg());
+
+ int64_t Offset = FrameInfo.getObjectOffset(Index);
+ if (Offset == 0) {
+ // XXX - This never happens because of emergency scavenging slot at 0?
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg)
+ .addImm(Log2_32(ST.getWavefrontSize()))
+ .addReg(DiffReg);
+ } else {
+ unsigned CarryOut
+ = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+ unsigned ScaledReg
+ = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+ // XXX - Should this use a vector shift?
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
+ .addReg(DiffReg, RegState::Kill)
+ .addImm(Log2_32(ST.getWavefrontSize()));
+
+ // TODO: Fold if use instruction is another add of a constant.
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_I32_e64), ResultReg)
+ .addReg(CarryOut, RegState::Define | RegState::Dead)
+ .addImm(Offset)
+ .addReg(ScaledReg, RegState::Kill);
+
+ MRI.setRegAllocationHint(CarryOut, 0, AMDGPU::VCC);
+ }
+
+ // Don't introduce an extra copy if we're just materializing in a mov.
+ if (IsCopy)
+ MI->eraseFromParent();
+ else
+ FIOp.ChangeToRegister(ResultReg, false, false, true);
+ return;
+ }
+
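The emitted sequence recovers an absolute scratch offset in three steps: subtract the scratch wave base from the frame offset, shift right by log2 of the wavefront size, then add the object's frame offset. A host-side restatement of the same arithmetic, assuming a 64-lane GCN wavefront:

#include <cassert>
#include <cstdint>

static unsigned log2u(unsigned X) {  // Log2_32 equivalent for powers of two
  unsigned R = 0;
  while (X >>= 1)
    ++R;
  return R;
}

static uint32_t absoluteScratchOffset(uint32_t FrameOffset,
                                      uint32_t ScratchWaveOffset,
                                      uint32_t ObjectOffset,
                                      unsigned WavefrontSize) {
  uint32_t Diff = FrameOffset - ScratchWaveOffset;   // S_SUB_U32
  uint32_t Scaled = Diff >> log2u(WavefrontSize);    // S_LSHR_B32
  return Scaled + ObjectOffset;                      // V_ADD_I32_e64
}

int main() {
  // Frame base 0x800 bytes past the wave's scratch base, object at +16:
  assert(absoluteScratchOffset(0x4800, 0x4000, 16, 64) == 0x20 + 16);
  return 0;
}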
+ if (IsMUBUF) {
// Disable offen so we don't need a 0 vgpr base.
assert(static_cast<int>(FIOperandNum) ==
AMDGPU::getNamedOperandIdx(MI->getOpcode(),
AMDGPU::OpName::vaddr));
+ assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg()
+ == MFI->getFrameOffsetReg());
+
int64_t Offset = FrameInfo.getObjectOffset(Index);
int64_t OldImm
= TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
@@ -995,17 +1085,19 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
if (isUInt<12>(NewOffset) &&
buildMUBUFOffsetLoadStore(TII, FrameInfo, MI, Index, NewOffset)) {
MI->eraseFromParent();
- break;
+ return;
}
}
+ // If the offset is simply too big, don't convert to a scratch wave offset
+ // relative index.
+
int64_t Offset = FrameInfo.getObjectOffset(Index);
FIOp.ChangeToImmediate(Offset);
if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- BuildMI(*MBB, MI, MI->getDebugLoc(),
- TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
- .addImm(Offset);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
+ .addImm(Offset);
FIOp.ChangeToRegister(TmpReg, false, false, true);
}
}
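Folding the frame object's offset into the instruction is only legal while the combined immediate still fits the MUBUF 12-bit unsigned offset field; otherwise the code falls through to materialize the offset. A minimal standalone model of that test:

#include <cassert>
#include <cstdint>

static bool canFoldMUBUFOffset(int64_t OldImm, int64_t ObjectOffset,
                               int64_t &NewOffset) {
  NewOffset = OldImm + ObjectOffset;
  // isUInt<12>: the MUBUF offset field holds 12 unsigned bits.
  return NewOffset >= 0 && NewOffset < (int64_t(1) << 12);
}

int main() {
  int64_t Off;
  assert(canFoldMUBUFOffset(8, 4087, Off) && Off == 4095); // just fits
  assert(!canFoldMUBUFOffset(8, 4088, Off));               // 4096 does not
  return 0;
}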
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h
index 679ed229758a0..b91cdddc5520f 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -17,6 +17,7 @@
#include "AMDGPURegisterInfo.h"
#include "SIDefines.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
namespace llvm {
@@ -57,8 +58,16 @@ public:
unsigned reservedPrivateSegmentWaveByteOffsetReg(
const MachineFunction &MF) const;
+ unsigned reservedStackPtrOffsetReg(const MachineFunction &MF) const;
+
BitVector getReservedRegs(const MachineFunction &MF) const override;
+ const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
+ const uint32_t *getCallPreservedMask(const MachineFunction &MF,
+ CallingConv::ID) const override;
+
+ unsigned getFrameRegister(const MachineFunction &MF) const override;
+
bool requiresRegisterScavenging(const MachineFunction &Fn) const override;
bool requiresFrameIndexScavenging(const MachineFunction &MF) const override;
@@ -228,6 +237,11 @@ public:
const int *getRegUnitPressureSets(unsigned RegUnit) const override;
+ unsigned getReturnAddressReg(const MachineFunction &MF) const {
+ // Not a callee saved register.
+ return AMDGPU::SGPR30_SGPR31;
+ }
+
private:
void buildSpillLoadStore(MachineBasicBlock::iterator MI,
unsigned LoadStoreOp,
diff --git a/lib/Target/AMDGPU/SOPInstructions.td b/lib/Target/AMDGPU/SOPInstructions.td
index 593439c2a3cd4..f2d8b6f7b7a4b 100644
--- a/lib/Target/AMDGPU/SOPInstructions.td
+++ b/lib/Target/AMDGPU/SOPInstructions.td
@@ -186,11 +186,23 @@ def S_BITSET1_B32 : SOP1_32 <"s_bitset1_b32">;
def S_BITSET1_B64 : SOP1_64_32 <"s_bitset1_b64">;
def S_GETPC_B64 : SOP1_64_0 <"s_getpc_b64">;
-let isTerminator = 1, isBarrier = 1,
- isBranch = 1, isIndirectBranch = 1 in {
+let isTerminator = 1, isBarrier = 1, SchedRW = [WriteBranch] in {
+
+let isBranch = 1, isIndirectBranch = 1 in {
def S_SETPC_B64 : SOP1_1 <"s_setpc_b64">;
+} // End isBranch = 1, isIndirectBranch = 1
+
+let isReturn = 1 in {
+// Define variant marked as return rather than branch.
+def S_SETPC_B64_return : SOP1_1<"", [(AMDGPUret_flag i64:$src0)]>;
+}
+} // End isTerminator = 1, isBarrier = 1
+
+let isCall = 1 in {
+def S_SWAPPC_B64 : SOP1_64 <"s_swappc_b64">;
}
-def S_SWAPPC_B64 : SOP1_64 <"s_swappc_b64">;
+
def S_RFE_B64 : SOP1_1 <"s_rfe_b64">;
let hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] in {
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index d565c84bfedaa..2abd4afad3b6c 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -518,7 +518,18 @@ bool isCompute(CallingConv::ID cc) {
}
bool isEntryFunctionCC(CallingConv::ID CC) {
- return true;
+ switch (CC) {
+ case CallingConv::AMDGPU_KERNEL:
+ case CallingConv::SPIR_KERNEL:
+ case CallingConv::AMDGPU_VS:
+ case CallingConv::AMDGPU_GS:
+ case CallingConv::AMDGPU_PS:
+ case CallingConv::AMDGPU_CS:
+ case CallingConv::AMDGPU_HS:
+ return true;
+ default:
+ return false;
+ }
}
bool isSI(const MCSubtargetInfo &STI) {
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index d6c836eb748b1..8e74aa2cc9a8b 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -262,7 +262,6 @@ bool isEntryFunctionCC(CallingConv::ID CC);
LLVM_READNONE
inline bool isKernel(CallingConv::ID CC) {
switch (CC) {
- case CallingConv::C:
case CallingConv::AMDGPU_KERNEL:
case CallingConv::SPIR_KERNEL:
return true;