aboutsummaryrefslogtreecommitdiff
path: root/lib/Target/AMDGPU
diff options
context:
space:
mode:
author Dimitry Andric <dim@FreeBSD.org> 2019-01-19 10:01:25 +0000
committer Dimitry Andric <dim@FreeBSD.org> 2019-01-19 10:01:25 +0000
commit d8e91e46262bc44006913e6796843909f1ac7bcd (patch)
tree 7d0c143d9b38190e0fa0180805389da22cd834c5 /lib/Target/AMDGPU
parent b7eb8e35e481a74962664b63dfb09483b200209a (diff)
Notes
Diffstat (limited to 'lib/Target/AMDGPU')
-rw-r--r--lib/Target/AMDGPU/AMDGPU.h49
-rw-r--r--lib/Target/AMDGPU/AMDGPU.td143
-rw-r--r--lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp85
-rw-r--r--lib/Target/AMDGPU/AMDGPUAliasAnalysis.h31
-rw-r--r--lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp4
-rw-r--r--lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp30
-rw-r--r--lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp24
-rw-r--r--lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp106
-rw-r--r--lib/Target/AMDGPU/AMDGPUAsmPrinter.h3
-rw-r--r--lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp458
-rw-r--r--lib/Target/AMDGPU/AMDGPUCallLowering.cpp9
-rw-r--r--lib/Target/AMDGPU/AMDGPUCallLowering.h6
-rw-r--r--lib/Target/AMDGPU/AMDGPUCallingConv.td6
-rw-r--r--lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp24
-rw-r--r--lib/Target/AMDGPU/AMDGPUFeatures.td9
-rw-r--r--lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp63
-rw-r--r--lib/Target/AMDGPU/AMDGPUGISel.td5
-rw-r--r--lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def78
-rw-r--r--lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp601
-rw-r--r--lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h109
-rw-r--r--lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp366
-rw-r--r--lib/Target/AMDGPU/AMDGPUISelLowering.cpp323
-rw-r--r--lib/Target/AMDGPU/AMDGPUISelLowering.h21
-rw-r--r--lib/Target/AMDGPU/AMDGPUInline.cpp23
-rw-r--r--lib/Target/AMDGPU/AMDGPUInstrInfo.td8
-rw-r--r--lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp6
-rw-r--r--lib/Target/AMDGPU/AMDGPUInstructionSelector.h3
-rw-r--r--lib/Target/AMDGPU/AMDGPUInstructions.td101
-rw-r--r--lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp4
-rw-r--r--lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h2
-rw-r--r--lib/Target/AMDGPU/AMDGPUIntrinsics.td16
-rw-r--r--lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp135
-rw-r--r--lib/Target/AMDGPU/AMDGPULibCalls.cpp5
-rw-r--r--lib/Target/AMDGPU/AMDGPULibFunc.cpp7
-rw-r--r--lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp16
-rw-r--r--lib/Target/AMDGPU/AMDGPUMCInstLower.cpp20
-rw-r--r--lib/Target/AMDGPU/AMDGPUMacroFusion.cpp5
-rw-r--r--lib/Target/AMDGPU/AMDGPUPTNote.h3
-rw-r--r--lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp10
-rw-r--r--lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp55
-rw-r--r--lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp247
-rw-r--r--lib/Target/AMDGPU/AMDGPURegisterBankInfo.h2
-rw-r--r--lib/Target/AMDGPU/AMDGPURegisterBanks.td5
-rw-r--r--lib/Target/AMDGPU/AMDGPURegisterInfo.h2
-rw-r--r--lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp6
-rw-r--r--lib/Target/AMDGPU/AMDGPUSubtarget.cpp34
-rw-r--r--lib/Target/AMDGPU/AMDGPUSubtarget.h196
-rw-r--r--lib/Target/AMDGPU/AMDGPUTargetMachine.cpp95
-rw-r--r--lib/Target/AMDGPU/AMDGPUTargetMachine.h10
-rw-r--r--lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp10
-rw-r--r--lib/Target/AMDGPU/AMDGPUTargetObjectFile.h2
-rw-r--r--lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp58
-rw-r--r--lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h2
-rw-r--r--lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp12
-rw-r--r--lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp182
-rw-r--r--lib/Target/AMDGPU/BUFInstructions.td356
-rw-r--r--lib/Target/AMDGPU/CMakeLists.txt6
-rw-r--r--lib/Target/AMDGPU/DSInstructions.td4
-rw-r--r--lib/Target/AMDGPU/FLATInstructions.td66
-rw-r--r--lib/Target/AMDGPU/GCNDPPCombine.cpp446
-rw-r--r--lib/Target/AMDGPU/GCNHazardRecognizer.cpp11
-rw-r--r--lib/Target/AMDGPU/GCNILPSched.cpp2
-rw-r--r--lib/Target/AMDGPU/GCNIterativeScheduler.cpp3
-rw-r--r--lib/Target/AMDGPU/GCNMinRegStrategy.cpp2
-rw-r--r--lib/Target/AMDGPU/GCNProcessors.td5
-rw-r--r--lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp31
-rw-r--r--lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h8
-rw-r--r--lib/Target/AMDGPU/LLVMBuild.txt2
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp6
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp516
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h37
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/LLVMBuild.txt2
-rw-r--r--lib/Target/AMDGPU/MIMGInstructions.td18
-rw-r--r--lib/Target/AMDGPU/R600EmitClauseMarkers.cpp4
-rw-r--r--lib/Target/AMDGPU/R600ISelLowering.cpp154
-rw-r--r--lib/Target/AMDGPU/R600ISelLowering.h8
-rw-r--r--lib/Target/AMDGPU/R600InstrInfo.cpp12
-rw-r--r--lib/Target/AMDGPU/R600InstrInfo.h2
-rw-r--r--lib/Target/AMDGPU/R600Instructions.td8
-rw-r--r--lib/Target/AMDGPU/R600MachineScheduler.cpp9
-rw-r--r--lib/Target/AMDGPU/SIAddIMGInit.cpp181
-rw-r--r--lib/Target/AMDGPU/SIAnnotateControlFlow.cpp101
-rw-r--r--lib/Target/AMDGPU/SIDefines.h5
-rw-r--r--lib/Target/AMDGPU/SIFixSGPRCopies.cpp18
-rw-r--r--lib/Target/AMDGPU/SIFixWWMLiveness.cpp385
-rw-r--r--lib/Target/AMDGPU/SIFixupVectorISel.cpp231
-rw-r--r--lib/Target/AMDGPU/SIFoldOperands.cpp164
-rw-r--r--lib/Target/AMDGPU/SIFormMemoryClauses.cpp19
-rw-r--r--lib/Target/AMDGPU/SIFrameLowering.cpp8
-rw-r--r--lib/Target/AMDGPU/SIISelLowering.cpp2266
-rw-r--r--lib/Target/AMDGPU/SIISelLowering.h43
-rw-r--r--lib/Target/AMDGPU/SIInsertSkips.cpp99
-rw-r--r--lib/Target/AMDGPU/SIInsertWaitcnts.cpp1478
-rw-r--r--lib/Target/AMDGPU/SIInstrFormats.td6
-rw-r--r--lib/Target/AMDGPU/SIInstrInfo.cpp1112
-rw-r--r--lib/Target/AMDGPU/SIInstrInfo.h116
-rw-r--r--lib/Target/AMDGPU/SIInstrInfo.td161
-rw-r--r--lib/Target/AMDGPU/SIInstructions.td159
-rw-r--r--lib/Target/AMDGPU/SIIntrinsics.td32
-rw-r--r--lib/Target/AMDGPU/SILoadStoreOptimizer.cpp1054
-rw-r--r--lib/Target/AMDGPU/SILowerControlFlow.cpp29
-rw-r--r--lib/Target/AMDGPU/SILowerI1Copies.cpp824
-rw-r--r--lib/Target/AMDGPU/SIMachineFunctionInfo.cpp21
-rw-r--r--lib/Target/AMDGPU/SIMachineScheduler.cpp20
-rw-r--r--lib/Target/AMDGPU/SIMemoryLegalizer.cpp25
-rw-r--r--lib/Target/AMDGPU/SIModeRegister.cpp406
-rw-r--r--lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp133
-rw-r--r--lib/Target/AMDGPU/SIPeepholeSDWA.cpp97
-rw-r--r--lib/Target/AMDGPU/SIRegisterInfo.cpp80
-rw-r--r--lib/Target/AMDGPU/SIRegisterInfo.h6
-rw-r--r--lib/Target/AMDGPU/SIRegisterInfo.td12
-rw-r--r--lib/Target/AMDGPU/SIShrinkInstructions.cpp361
-rw-r--r--lib/Target/AMDGPU/SMInstructions.td177
-rw-r--r--lib/Target/AMDGPU/SOPInstructions.td96
-rw-r--r--lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp350
-rw-r--r--lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h204
-rw-r--r--lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp75
-rw-r--r--lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h24
-rw-r--r--lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h1
-rw-r--r--lib/Target/AMDGPU/Utils/CMakeLists.txt1
-rw-r--r--lib/Target/AMDGPU/VOP1Instructions.td47
-rw-r--r--lib/Target/AMDGPU/VOP2Instructions.td279
-rw-r--r--lib/Target/AMDGPU/VOP3Instructions.td184
-rw-r--r--lib/Target/AMDGPU/VOP3PInstructions.td124
-rw-r--r--lib/Target/AMDGPU/VOPCInstructions.td26
-rw-r--r--lib/Target/AMDGPU/VOPInstructions.td117
126 files changed, 12013 insertions, 4897 deletions
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h
index 796766d946221..bb7801c172f60 100644
--- a/lib/Target/AMDGPU/AMDGPU.h
+++ b/lib/Target/AMDGPU/AMDGPU.h
@@ -37,10 +37,13 @@ FunctionPass *createAMDGPUCFGStructurizerPass();
FunctionPass *createR600ISelDag(TargetMachine *TM, CodeGenOpt::Level OptLevel);
// SI Passes
+FunctionPass *createGCNDPPCombinePass();
FunctionPass *createSIAnnotateControlFlowPass();
FunctionPass *createSIFoldOperandsPass();
FunctionPass *createSIPeepholeSDWAPass();
FunctionPass *createSILowerI1CopiesPass();
+FunctionPass *createSIFixupVectorISelPass();
+FunctionPass *createSIAddIMGInitPass();
FunctionPass *createSIShrinkInstructionsPass();
FunctionPass *createSILoadStoreOptimizerPass();
FunctionPass *createSIWholeQuadModePass();
@@ -57,6 +60,7 @@ FunctionPass *createAMDGPUUseNativeCallsPass();
FunctionPass *createAMDGPUCodeGenPreparePass();
FunctionPass *createAMDGPUMachineCFGStructurizerPass();
FunctionPass *createAMDGPURewriteOutArgumentsPass();
+FunctionPass *createSIModeRegisterPass();
void initializeAMDGPUDAGToDAGISelPass(PassRegistry&);
@@ -69,10 +73,18 @@ Pass *createAMDGPUAnnotateKernelFeaturesPass();
void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
extern char &AMDGPUAnnotateKernelFeaturesID;
+FunctionPass *createAMDGPUAtomicOptimizerPass();
+void initializeAMDGPUAtomicOptimizerPass(PassRegistry &);
+extern char &AMDGPUAtomicOptimizerID;
+
ModulePass *createAMDGPULowerIntrinsicsPass();
void initializeAMDGPULowerIntrinsicsPass(PassRegistry &);
extern char &AMDGPULowerIntrinsicsID;
+ModulePass *createAMDGPUFixFunctionBitcastsPass();
+void initializeAMDGPUFixFunctionBitcastsPass(PassRegistry &);
+extern char &AMDGPUFixFunctionBitcastsID;
+
FunctionPass *createAMDGPULowerKernelArgumentsPass();
void initializeAMDGPULowerKernelArgumentsPass(PassRegistry &);
extern char &AMDGPULowerKernelArgumentsID;
@@ -84,6 +96,9 @@ extern char &AMDGPULowerKernelAttributesID;
void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &);
extern char &AMDGPURewriteOutArgumentsID;
+void initializeGCNDPPCombinePass(PassRegistry &);
+extern char &GCNDPPCombineID;
+
void initializeR600ClauseMergePassPass(PassRegistry &);
extern char &R600ClauseMergePassID;
@@ -114,6 +129,9 @@ extern char &SIFixSGPRCopiesID;
void initializeSIFixVGPRCopiesPass(PassRegistry &);
extern char &SIFixVGPRCopiesID;
+void initializeSIFixupVectorISelPass(PassRegistry &);
+extern char &SIFixupVectorISelID;
+
void initializeSILowerI1CopiesPass(PassRegistry &);
extern char &SILowerI1CopiesID;
@@ -141,6 +159,9 @@ extern char &AMDGPUSimplifyLibCallsID;
void initializeAMDGPUUseNativeCallsPass(PassRegistry &);
extern char &AMDGPUUseNativeCallsID;
+void initializeSIAddIMGInitPass(PassRegistry &);
+extern char &SIAddIMGInitID;
+
void initializeAMDGPUPerfHintAnalysisPass(PassRegistry &);
extern char &AMDGPUPerfHintAnalysisID;
@@ -179,6 +200,9 @@ extern char &SIMemoryLegalizerID;
void initializeSIDebuggerInsertNopsPass(PassRegistry&);
extern char &SIDebuggerInsertNopsID;
+void initializeSIModeRegisterPass(PassRegistry&);
+extern char &SIModeRegisterID;
+
void initializeSIInsertWaitcntsPass(PassRegistry&);
extern char &SIInsertWaitcntsID;
@@ -190,6 +214,8 @@ extern char &AMDGPUUnifyDivergentExitNodesID;
ImmutablePass *createAMDGPUAAWrapperPass();
void initializeAMDGPUAAWrapperPassPass(PassRegistry&);
+ImmutablePass *createAMDGPUExternalAAWrapperPass();
+void initializeAMDGPUExternalAAWrapperPass(PassRegistry&);
void initializeAMDGPUArgumentUsageInfoPass(PassRegistry &);
@@ -221,19 +247,18 @@ enum TargetIndex {
/// however on the GPU, each address space points to
/// a separate piece of memory that is unique from other
/// memory locations.
-struct AMDGPUAS {
- // The following address space values depend on the triple environment.
- unsigned PRIVATE_ADDRESS; ///< Address space for private memory.
- unsigned FLAT_ADDRESS; ///< Address space for flat memory.
- unsigned REGION_ADDRESS; ///< Address space for region memory.
-
+namespace AMDGPUAS {
enum : unsigned {
// The maximum value for flat, generic, local, private, constant and region.
- MAX_COMMON_ADDRESS = 5,
+ MAX_AMDGPU_ADDRESS = 6,
+ FLAT_ADDRESS = 0, ///< Address space for flat memory.
GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0).
+ REGION_ADDRESS = 2, ///< Address space for region memory.
+
CONSTANT_ADDRESS = 4, ///< Address space for constant memory (VTX2)
LOCAL_ADDRESS = 3, ///< Address space for local memory.
+ PRIVATE_ADDRESS = 5, ///< Address space for private memory.
CONSTANT_ADDRESS_32BIT = 6, ///< Address space for 32-bit constant memory
@@ -268,14 +293,6 @@ struct AMDGPUAS {
// Some places use this if the address space can't be determined.
UNKNOWN_ADDRESS_SPACE = ~0u,
};
-};
-
-namespace llvm {
-namespace AMDGPU {
-AMDGPUAS getAMDGPUAS(const Module &M);
-AMDGPUAS getAMDGPUAS(const TargetMachine &TM);
-AMDGPUAS getAMDGPUAS(Triple T);
-} // namespace AMDGPU
-} // namespace llvm
+}
#endif
diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td
index 16c2a366db285..6a4cfe08e4910 100644
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@@ -11,6 +11,10 @@ include "llvm/TableGen/SearchableTable.td"
include "llvm/Target/Target.td"
include "AMDGPUFeatures.td"
+class BoolToList<bit Value> {
+ list<int> ret = !if(Value, [1]<int>, []<int>);
+}
+
//===------------------------------------------------------------===//
// Subtarget Features (device properties)
//===------------------------------------------------------------===//
@@ -140,6 +144,12 @@ def FeatureCIInsts : SubtargetFeature<"ci-insts",
"Additional instructions for CI+"
>;
+def FeatureVIInsts : SubtargetFeature<"vi-insts",
+ "VIInsts",
+ "true",
+ "Additional instructions for VI+"
+>;
+
def FeatureGFX9Insts : SubtargetFeature<"gfx9-insts",
"GFX9Insts",
"true",
@@ -236,6 +246,12 @@ def FeatureDPP : SubtargetFeature<"dpp",
"Support DPP (Data Parallel Primitives) extension"
>;
+def FeatureR128A16 : SubtargetFeature<"r128-a16",
+ "HasR128A16",
+ "true",
+ "Support 16 bit coordindates/gradients/lod/clamp/mip types on gfx9"
+>;
+
def FeatureIntClamp : SubtargetFeature<"int-clamp-insts",
"HasIntClamp",
"true",
@@ -251,31 +267,25 @@ def FeatureUnpackedD16VMem : SubtargetFeature<"unpacked-d16-vmem",
def FeatureDLInsts : SubtargetFeature<"dl-insts",
"HasDLInsts",
"true",
- "Has deep learning instructions"
+ "Has v_fmac_f32 and v_xnor_b32 instructions"
>;
-def FeatureD16PreservesUnusedBits : SubtargetFeature<
- "d16-preserves-unused-bits",
- "D16PreservesUnusedBits",
+def FeatureDotInsts : SubtargetFeature<"dot-insts",
+ "HasDotInsts",
"true",
- "If present, then instructions defined by HasD16LoadStore predicate preserve "
- "unused bits. Otherwise instructions defined by HasD16LoadStore predicate "
- "zero unused bits."
+ "Has v_dot* instructions"
+>;
+
+def FeatureSRAMECC : SubtargetFeature<"sram-ecc",
+ "EnableSRAMECC",
+ "true",
+ "Enable SRAM ECC"
>;
//===------------------------------------------------------------===//
// Subtarget Features (options and debugging)
//===------------------------------------------------------------===//
-// Some instructions do not support denormals despite this flag. Using
-// fp32 denormals also causes instructions to run at the double
-// precision rate for the device.
-def FeatureFP32Denormals : SubtargetFeature<"fp32-denormals",
- "FP32Denormals",
- "true",
- "Enable single precision denormal handling"
->;
-
// Denormal handling for fp64 and fp16 is controlled by the same
// config register when fp16 supported.
// TODO: Do we need a separate f16 setting when not legal?
@@ -324,12 +334,6 @@ def FeatureEnableHugePrivateBuffer : SubtargetFeature<
"Enable private/scratch buffer sizes greater than 128 GB"
>;
-def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling",
- "EnableVGPRSpilling",
- "true",
- "Enable spilling of VGPRs to scratch memory"
->;
-
def FeatureDumpCode : SubtargetFeature <"DumpCode",
"DumpCode",
"true",
@@ -373,6 +377,16 @@ def FeatureEnableDS128 : SubtargetFeature<"enable-ds128",
"Use ds_{read|write}_b128"
>;
+// Sparse texture support requires that all result registers are zeroed when
+// PRTStrictNull is set to true. This feature is turned on for all architectures
+// but is enabled as a feature in case there are situations where PRTStrictNull
+// is disabled by the driver.
+def FeatureEnablePRTStrictNull : SubtargetFeature<"enable-prt-strict-null",
+ "EnablePRTStrictNull",
+ "true",
+ "Enable zeroing of result registers for sparse texture fetches"
+>;
+
// Unless +-flat-for-global is specified, turn on FlatForGlobal for
// all OS-es on VI and newer hardware to avoid assertion failures due
// to missing ADDR64 variants of MUBUF instructions.
@@ -399,6 +413,12 @@ def FeatureCodeObjectV3 : SubtargetFeature <
"Generate code object version 3"
>;
+def FeatureTrigReducedRange : SubtargetFeature<"trig-reduced-range",
+ "HasTrigReducedRange",
+ "true",
+ "Requires use of fract on arguments to trig instructions"
+>;
+
// Dummy feature used to disable assembler instructions.
def FeatureDisable : SubtargetFeature<"",
"FeatureDisable","true",
@@ -418,36 +438,36 @@ class GCNSubtargetFeatureGeneration <string Value,
def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
[FeatureFP64, FeatureLocalMemorySize32768, FeatureMIMG_R128,
FeatureWavefrontSize64, FeatureGCN,
- FeatureLDSBankCount32, FeatureMovrel]
+ FeatureLDSBankCount32, FeatureMovrel, FeatureTrigReducedRange]
>;
def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS",
[FeatureFP64, FeatureLocalMemorySize65536, FeatureMIMG_R128,
FeatureWavefrontSize64, FeatureGCN, FeatureFlatAddressSpace,
- FeatureCIInsts, FeatureMovrel]
+ FeatureCIInsts, FeatureMovrel, FeatureTrigReducedRange]
>;
def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
[FeatureFP64, FeatureLocalMemorySize65536, FeatureMIMG_R128,
FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN,
- FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
+ FeatureGCN3Encoding, FeatureCIInsts, FeatureVIInsts, Feature16BitInsts,
FeatureSMemRealTime, FeatureVGPRIndexMode, FeatureMovrel,
FeatureScalarStores, FeatureInv2PiInlineImm,
FeatureSDWA, FeatureSDWAOutModsVOPC, FeatureSDWAMac, FeatureDPP,
- FeatureIntClamp
+ FeatureIntClamp, FeatureTrigReducedRange
]
>;
def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",
[FeatureFP64, FeatureLocalMemorySize65536,
FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN,
- FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
+ FeatureGCN3Encoding, FeatureCIInsts, FeatureVIInsts, Feature16BitInsts,
FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm,
FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode,
FeatureFastFMAF32, FeatureDPP, FeatureIntClamp,
FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst,
FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts,
- FeatureAddNoCarryInsts, FeatureScalarAtomics
+ FeatureAddNoCarryInsts, FeatureScalarAtomics, FeatureR128A16
]
>;
@@ -465,34 +485,41 @@ def FeatureISAVersion6_0_0 : SubtargetFeatureISAVersion <6,0,0,
[FeatureSouthernIslands,
FeatureFastFMAF32,
HalfRate64Ops,
- FeatureLDSBankCount32]>;
+ FeatureLDSBankCount32,
+ FeatureCodeObjectV3]>;
def FeatureISAVersion6_0_1 : SubtargetFeatureISAVersion <6,0,1,
[FeatureSouthernIslands,
- FeatureLDSBankCount32]>;
+ FeatureLDSBankCount32,
+ FeatureCodeObjectV3]>;
def FeatureISAVersion7_0_0 : SubtargetFeatureISAVersion <7,0,0,
[FeatureSeaIslands,
- FeatureLDSBankCount32]>;
+ FeatureLDSBankCount32,
+ FeatureCodeObjectV3]>;
def FeatureISAVersion7_0_1 : SubtargetFeatureISAVersion <7,0,1,
[FeatureSeaIslands,
HalfRate64Ops,
FeatureLDSBankCount32,
- FeatureFastFMAF32]>;
+ FeatureFastFMAF32,
+ FeatureCodeObjectV3]>;
def FeatureISAVersion7_0_2 : SubtargetFeatureISAVersion <7,0,2,
[FeatureSeaIslands,
FeatureLDSBankCount16,
- FeatureFastFMAF32]>;
+ FeatureFastFMAF32,
+ FeatureCodeObjectV3]>;
def FeatureISAVersion7_0_3 : SubtargetFeatureISAVersion <7,0,3,
[FeatureSeaIslands,
- FeatureLDSBankCount16]>;
+ FeatureLDSBankCount16,
+ FeatureCodeObjectV3]>;
def FeatureISAVersion7_0_4 : SubtargetFeatureISAVersion <7,0,4,
[FeatureSeaIslands,
- FeatureLDSBankCount32]>;
+ FeatureLDSBankCount32,
+ FeatureCodeObjectV3]>;
def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1,
[FeatureVolcanicIslands,
@@ -500,49 +527,63 @@ def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1,
HalfRate64Ops,
FeatureLDSBankCount32,
FeatureXNACK,
- FeatureUnpackedD16VMem]>;
+ FeatureUnpackedD16VMem,
+ FeatureCodeObjectV3]>;
def FeatureISAVersion8_0_2 : SubtargetFeatureISAVersion <8,0,2,
[FeatureVolcanicIslands,
FeatureLDSBankCount32,
FeatureSGPRInitBug,
- FeatureUnpackedD16VMem]>;
+ FeatureUnpackedD16VMem,
+ FeatureCodeObjectV3]>;
def FeatureISAVersion8_0_3 : SubtargetFeatureISAVersion <8,0,3,
[FeatureVolcanicIslands,
FeatureLDSBankCount32,
- FeatureUnpackedD16VMem]>;
+ FeatureUnpackedD16VMem,
+ FeatureCodeObjectV3]>;
def FeatureISAVersion8_1_0 : SubtargetFeatureISAVersion <8,1,0,
[FeatureVolcanicIslands,
FeatureLDSBankCount16,
- FeatureXNACK]>;
+ FeatureXNACK,
+ FeatureCodeObjectV3]>;
def FeatureISAVersion9_0_0 : SubtargetFeatureISAVersion <9,0,0,
[FeatureGFX9,
FeatureMadMixInsts,
FeatureLDSBankCount32,
- FeatureD16PreservesUnusedBits]>;
+ FeatureCodeObjectV3]>;
def FeatureISAVersion9_0_2 : SubtargetFeatureISAVersion <9,0,2,
[FeatureGFX9,
FeatureMadMixInsts,
FeatureLDSBankCount32,
FeatureXNACK,
- FeatureD16PreservesUnusedBits]>;
+ FeatureCodeObjectV3]>;
def FeatureISAVersion9_0_4 : SubtargetFeatureISAVersion <9,0,4,
[FeatureGFX9,
FeatureLDSBankCount32,
FeatureFmaMixInsts,
- FeatureD16PreservesUnusedBits]>;
+ FeatureCodeObjectV3]>;
def FeatureISAVersion9_0_6 : SubtargetFeatureISAVersion <9,0,6,
[FeatureGFX9,
HalfRate64Ops,
FeatureFmaMixInsts,
FeatureLDSBankCount32,
- FeatureDLInsts]>;
+ FeatureDLInsts,
+ FeatureDotInsts,
+ FeatureSRAMECC,
+ FeatureCodeObjectV3]>;
+
+def FeatureISAVersion9_0_9 : SubtargetFeatureISAVersion <9,0,9,
+ [FeatureGFX9,
+ FeatureMadMixInsts,
+ FeatureLDSBankCount32,
+ FeatureXNACK,
+ FeatureCodeObjectV3]>;
//===----------------------------------------------------------------------===//
// Debugger related subtarget features.
@@ -674,8 +715,9 @@ def HasUnpackedD16VMem : Predicate<"Subtarget->hasUnpackedD16VMem()">,
def HasPackedD16VMem : Predicate<"!Subtarget->hasUnpackedD16VMem()">,
AssemblerPredicate<"!FeatureUnpackedD16VMem">;
-def D16PreservesUnusedBits : Predicate<"Subtarget->d16PreservesUnusedBits()">,
- AssemblerPredicate<"FeatureD16PreservesUnusedBits">;
+def D16PreservesUnusedBits :
+ Predicate<"Subtarget->hasD16LoadStore() && !Subtarget->isSRAMECCEnabled()">,
+ AssemblerPredicate<"FeatureGFX9Insts,!FeatureSRAMECC">;
def LDSRequiresM0Init : Predicate<"Subtarget->ldsRequiresM0Init()">;
def NotLDSRequiresM0Init : Predicate<"!Subtarget->ldsRequiresM0Init()">;
@@ -683,10 +725,10 @@ def NotLDSRequiresM0Init : Predicate<"!Subtarget->ldsRequiresM0Init()">;
def HasDSAddTid : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9">,
AssemblerPredicate<"FeatureGFX9Insts">;
-def HasAddNoCarryInsts : Predicate<"Subtarget->hasAddNoCarryInsts()">,
+def HasAddNoCarryInsts : Predicate<"Subtarget->hasAddNoCarry()">,
AssemblerPredicate<"FeatureAddNoCarryInsts">;
-def NotHasAddNoCarryInsts : Predicate<"!Subtarget->hasAddNoCarryInsts()">,
+def NotHasAddNoCarryInsts : Predicate<"!Subtarget->hasAddNoCarry()">,
AssemblerPredicate<"!FeatureAddNoCarryInsts">;
def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">,
@@ -706,6 +748,9 @@ def HasSDWA9 : Predicate<"Subtarget->hasSDWA()">,
def HasDPP : Predicate<"Subtarget->hasDPP()">,
AssemblerPredicate<"FeatureDPP">;
+def HasR128A16 : Predicate<"Subtarget->hasR128A16()">,
+ AssemblerPredicate<"FeatureR128A16">;
+
def HasIntClamp : Predicate<"Subtarget->hasIntClamp()">,
AssemblerPredicate<"FeatureIntClamp">;
@@ -728,6 +773,9 @@ def HasFmaMixInsts : Predicate<"Subtarget->hasFmaMixInsts()">,
def HasDLInsts : Predicate<"Subtarget->hasDLInsts()">,
AssemblerPredicate<"FeatureDLInsts">;
+def HasDotInsts : Predicate<"Subtarget->hasDotInsts()">,
+ AssemblerPredicate<"FeatureDotInsts">;
+
def EnableLateCFGStructurize : Predicate<
"EnableLateStructurizeCFG">;
@@ -736,7 +784,6 @@ def EnableLateCFGStructurize : Predicate<
include "SISchedule.td"
include "GCNProcessors.td"
include "AMDGPUInstrInfo.td"
-include "AMDGPUIntrinsics.td"
include "SIIntrinsics.td"
include "AMDGPURegisterInfo.td"
include "AMDGPURegisterBanks.td"
diff --git a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
index ef4b69d09d9f5..73709ba13643e 100644
--- a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
@@ -34,69 +34,45 @@ using namespace llvm;
// Register this pass...
char AMDGPUAAWrapperPass::ID = 0;
+char AMDGPUExternalAAWrapper::ID = 0;
INITIALIZE_PASS(AMDGPUAAWrapperPass, "amdgpu-aa",
"AMDGPU Address space based Alias Analysis", false, true)
+INITIALIZE_PASS(AMDGPUExternalAAWrapper, "amdgpu-aa-wrapper",
+ "AMDGPU Address space based Alias Analysis Wrapper", false, true)
+
ImmutablePass *llvm::createAMDGPUAAWrapperPass() {
return new AMDGPUAAWrapperPass();
}
+ImmutablePass *llvm::createAMDGPUExternalAAWrapperPass() {
+ return new AMDGPUExternalAAWrapper();
+}
+
void AMDGPUAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
}
-// Must match the table in getAliasResult.
-AMDGPUAAResult::ASAliasRulesTy::ASAliasRulesTy(AMDGPUAS AS_, Triple::ArchType Arch_)
- : Arch(Arch_), AS(AS_) {
- // These arrarys are indexed by address space value
- // enum elements 0 ... to 5
- static const AliasResult ASAliasRulesPrivIsZero[6][6] = {
- /* Private Global Constant Group Flat Region*/
- /* Private */ {MayAlias, NoAlias , NoAlias , NoAlias , MayAlias, NoAlias},
- /* Global */ {NoAlias , MayAlias, NoAlias , NoAlias , MayAlias, NoAlias},
- /* Constant */ {NoAlias , NoAlias , MayAlias, NoAlias , MayAlias, NoAlias},
- /* Group */ {NoAlias , NoAlias , NoAlias , MayAlias, MayAlias, NoAlias},
- /* Flat */ {MayAlias, MayAlias, MayAlias, MayAlias, MayAlias, MayAlias},
- /* Region */ {NoAlias , NoAlias , NoAlias , NoAlias , MayAlias, MayAlias}
- };
- static const AliasResult ASAliasRulesGenIsZero[6][6] = {
- /* Flat Global Region Group Constant Private */
- /* Flat */ {MayAlias, MayAlias, MayAlias, MayAlias, MayAlias, MayAlias},
- /* Global */ {MayAlias, MayAlias, NoAlias , NoAlias , NoAlias , NoAlias},
- /* Constant */ {MayAlias, NoAlias , MayAlias, NoAlias , NoAlias, NoAlias},
- /* Group */ {MayAlias, NoAlias , NoAlias , MayAlias, NoAlias , NoAlias},
- /* Region */ {MayAlias, NoAlias , NoAlias , NoAlias, MayAlias, NoAlias},
- /* Private */ {MayAlias, NoAlias , NoAlias , NoAlias , NoAlias , MayAlias}
- };
- assert(AS.MAX_COMMON_ADDRESS <= 5);
- if (AS.FLAT_ADDRESS == 0) {
- assert(AS.GLOBAL_ADDRESS == 1 &&
- AS.REGION_ADDRESS == 2 &&
- AS.LOCAL_ADDRESS == 3 &&
- AS.CONSTANT_ADDRESS == 4 &&
- AS.PRIVATE_ADDRESS == 5);
- ASAliasRules = &ASAliasRulesGenIsZero;
- } else {
- assert(AS.PRIVATE_ADDRESS == 0 &&
- AS.GLOBAL_ADDRESS == 1 &&
- AS.CONSTANT_ADDRESS == 2 &&
- AS.LOCAL_ADDRESS == 3 &&
- AS.FLAT_ADDRESS == 4 &&
- AS.REGION_ADDRESS == 5);
- ASAliasRules = &ASAliasRulesPrivIsZero;
- }
-}
+// This table is indexed by address space value (enum elements 0 to 6).
+static const AliasResult ASAliasRules[7][7] = {
+ /* Flat Global Region Group Constant Private Constant 32-bit */
+ /* Flat */ {MayAlias, MayAlias, MayAlias, MayAlias, MayAlias, MayAlias, MayAlias},
+ /* Global */ {MayAlias, MayAlias, NoAlias , NoAlias , MayAlias, NoAlias , MayAlias},
+ /* Region */ {MayAlias, NoAlias , NoAlias , NoAlias, MayAlias, NoAlias , MayAlias},
+ /* Group */ {MayAlias, NoAlias , NoAlias , MayAlias, NoAlias , NoAlias , NoAlias},
+ /* Constant */ {MayAlias, MayAlias, MayAlias, NoAlias , NoAlias, NoAlias , MayAlias},
+ /* Private */ {MayAlias, NoAlias , NoAlias , NoAlias , NoAlias , MayAlias, NoAlias},
+ /* Constant 32-bit */ {MayAlias, MayAlias, MayAlias, NoAlias , MayAlias, NoAlias , NoAlias}
+};
-AliasResult AMDGPUAAResult::ASAliasRulesTy::getAliasResult(unsigned AS1,
- unsigned AS2) const {
- if (AS1 > AS.MAX_COMMON_ADDRESS || AS2 > AS.MAX_COMMON_ADDRESS) {
- if (Arch == Triple::amdgcn)
- report_fatal_error("Pointer address space out of range");
- return AS1 == AS2 ? MayAlias : NoAlias;
- }
+static AliasResult getAliasResult(unsigned AS1, unsigned AS2) {
+ static_assert(AMDGPUAS::MAX_AMDGPU_ADDRESS <= 6, "Addr space out of range");
- return (*ASAliasRules)[AS1][AS2];
+ if (AS1 > AMDGPUAS::MAX_AMDGPU_ADDRESS || AS2 > AMDGPUAS::MAX_AMDGPU_ADDRESS)
+ return MayAlias;
+
+ return ASAliasRules[AS1][AS2];
}
AliasResult AMDGPUAAResult::alias(const MemoryLocation &LocA,
@@ -104,8 +80,9 @@ AliasResult AMDGPUAAResult::alias(const MemoryLocation &LocA,
unsigned asA = LocA.Ptr->getType()->getPointerAddressSpace();
unsigned asB = LocB.Ptr->getType()->getPointerAddressSpace();
- AliasResult Result = ASAliasRules.getAliasResult(asA, asB);
- if (Result == NoAlias) return Result;
+ AliasResult Result = getAliasResult(asA, asB);
+ if (Result == NoAlias)
+ return Result;
// Forward the query to the next alias analysis.
return AAResultBase::alias(LocA, LocB);
@@ -114,9 +91,9 @@ AliasResult AMDGPUAAResult::alias(const MemoryLocation &LocA,
bool AMDGPUAAResult::pointsToConstantMemory(const MemoryLocation &Loc,
bool OrLocal) {
const Value *Base = GetUnderlyingObject(Loc.Ptr, DL);
-
- if (Base->getType()->getPointerAddressSpace() == AS.CONSTANT_ADDRESS ||
- Base->getType()->getPointerAddressSpace() == AS.CONSTANT_ADDRESS_32BIT) {
+ unsigned AS = Base->getType()->getPointerAddressSpace();
+ if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
+ AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
return true;
}
diff --git a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h
index 645a38af753ce..d76c9fc481995 100644
--- a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h
+++ b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h
@@ -33,14 +33,12 @@ class AMDGPUAAResult : public AAResultBase<AMDGPUAAResult> {
friend AAResultBase<AMDGPUAAResult>;
const DataLayout &DL;
- AMDGPUAS AS;
public:
explicit AMDGPUAAResult(const DataLayout &DL, Triple T) : AAResultBase(),
- DL(DL), AS(AMDGPU::getAMDGPUAS(T)), ASAliasRules(AS, T.getArch()) {}
+ DL(DL) {}
AMDGPUAAResult(AMDGPUAAResult &&Arg)
- : AAResultBase(std::move(Arg)), DL(Arg.DL), AS(Arg.AS),
- ASAliasRules(Arg.ASAliasRules){}
+ : AAResultBase(std::move(Arg)), DL(Arg.DL) {}
/// Handle invalidation events from the new pass manager.
///
@@ -53,18 +51,6 @@ public:
private:
bool Aliases(const MDNode *A, const MDNode *B) const;
bool PathAliases(const MDNode *A, const MDNode *B) const;
-
- class ASAliasRulesTy {
- public:
- ASAliasRulesTy(AMDGPUAS AS_, Triple::ArchType Arch_);
-
- AliasResult getAliasResult(unsigned AS1, unsigned AS2) const;
-
- private:
- Triple::ArchType Arch;
- AMDGPUAS AS;
- const AliasResult (*ASAliasRules)[6][6];
- } ASAliasRules;
};
/// Analysis pass providing a never-invalidated alias analysis result.
@@ -110,6 +96,19 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override;
};
+// Wrapper around ExternalAAWrapperPass so that the default constructor gets the
+// callback.
+class AMDGPUExternalAAWrapper : public ExternalAAWrapperPass {
+public:
+ static char ID;
+
+ AMDGPUExternalAAWrapper() : ExternalAAWrapperPass(
+ [](Pass &P, Function &, AAResults &AAR) {
+ if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
+ AAR.addAAResult(WrapperPass->getResult());
+ }) {}
+};
+
} // end namespace llvm
#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUALIASANALYSIS_H
diff --git a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
index d4bbb2c1eb8d1..fc65430b745f3 100644
--- a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
@@ -86,8 +86,6 @@ void AMDGPUAlwaysInline::recursivelyVisitUsers(
}
bool AMDGPUAlwaysInline::runOnModule(Module &M) {
- AMDGPUAS AMDGPUAS = AMDGPU::getAMDGPUAS(M);
-
std::vector<GlobalAlias*> AliasesToRemove;
SmallPtrSet<Function *, 8> FuncsToAlwaysInline;
@@ -122,7 +120,7 @@ bool AMDGPUAlwaysInline::runOnModule(Module &M) {
for (GlobalVariable &GV : M.globals()) {
// TODO: Region address
unsigned AS = GV.getType()->getAddressSpace();
- if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS.REGION_ADDRESS)
+ if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS)
continue;
recursivelyVisitUsers(GV, FuncsToAlwaysInline);
diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
index 1a70833a4472f..896ac9c87779e 100644
--- a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -46,7 +46,6 @@ namespace {
class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
private:
const TargetMachine *TM = nullptr;
- AMDGPUAS AS;
bool addFeatureAttributes(Function &F);
@@ -67,11 +66,10 @@ public:
CallGraphSCCPass::getAnalysisUsage(AU);
}
- static bool visitConstantExpr(const ConstantExpr *CE, AMDGPUAS AS);
+ static bool visitConstantExpr(const ConstantExpr *CE);
static bool visitConstantExprsRecursively(
const Constant *EntryC,
- SmallPtrSet<const Constant *, 8> &ConstantExprVisited,
- AMDGPUAS AS);
+ SmallPtrSet<const Constant *, 8> &ConstantExprVisited);
};
} // end anonymous namespace
@@ -85,20 +83,18 @@ INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
// The queue ptr is only needed when casting to flat, not from it.
-static bool castRequiresQueuePtr(unsigned SrcAS, const AMDGPUAS &AS) {
- return SrcAS == AS.LOCAL_ADDRESS || SrcAS == AS.PRIVATE_ADDRESS;
+static bool castRequiresQueuePtr(unsigned SrcAS) {
+ return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}
-static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC,
- const AMDGPUAS &AS) {
- return castRequiresQueuePtr(ASC->getSrcAddressSpace(), AS);
+static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
+ return castRequiresQueuePtr(ASC->getSrcAddressSpace());
}
-bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE,
- AMDGPUAS AS) {
+bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
if (CE->getOpcode() == Instruction::AddrSpaceCast) {
unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
- return castRequiresQueuePtr(SrcAS, AS);
+ return castRequiresQueuePtr(SrcAS);
}
return false;
@@ -106,8 +102,7 @@ bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE,
bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
const Constant *EntryC,
- SmallPtrSet<const Constant *, 8> &ConstantExprVisited,
- AMDGPUAS AS) {
+ SmallPtrSet<const Constant *, 8> &ConstantExprVisited) {
if (!ConstantExprVisited.insert(EntryC).second)
return false;
@@ -120,7 +115,7 @@ bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
// Check this constant expression.
if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
- if (visitConstantExpr(CE, AS))
+ if (visitConstantExpr(CE))
return true;
}
@@ -262,7 +257,7 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
continue;
if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
- if (castRequiresQueuePtr(ASC, AS)) {
+ if (castRequiresQueuePtr(ASC)) {
NeedQueuePtr = true;
continue;
}
@@ -273,7 +268,7 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
if (!OpC)
continue;
- if (visitConstantExprsRecursively(OpC, ConstantExprVisited, AS)) {
+ if (visitConstantExprsRecursively(OpC, ConstantExprVisited)) {
NeedQueuePtr = true;
break;
}
@@ -318,7 +313,6 @@ bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
if (!TPC)
report_fatal_error("TargetMachine is required");
- AS = AMDGPU::getAMDGPUAS(CG.getModule());
TM = &TPC->getTM<TargetMachine>();
return false;
}
diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
index ed5370826647f..f88e3b0dac860 100644
--- a/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -16,7 +16,7 @@
#include "AMDGPU.h"
#include "AMDGPUIntrinsicInfo.h"
#include "llvm/ADT/SetVector.h"
-#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
#include "llvm/IR/IRBuilder.h"
@@ -32,12 +32,11 @@ namespace {
class AMDGPUAnnotateUniformValues : public FunctionPass,
public InstVisitor<AMDGPUAnnotateUniformValues> {
- DivergenceAnalysis *DA;
+ LegacyDivergenceAnalysis *DA;
MemoryDependenceResults *MDR;
LoopInfo *LI;
DenseMap<Value*, GetElementPtrInst*> noClobberClones;
bool isKernelFunc;
- AMDGPUAS AMDGPUASI;
public:
static char ID;
@@ -49,7 +48,7 @@ public:
return "AMDGPU Annotate Uniform Values";
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<DivergenceAnalysis>();
+ AU.addRequired<LegacyDivergenceAnalysis>();
AU.addRequired<MemoryDependenceWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();
AU.setPreservesAll();
@@ -64,7 +63,7 @@ public:
INITIALIZE_PASS_BEGIN(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
"Add AMDGPU uniform metadata", false, false)
-INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
@@ -118,14 +117,8 @@ bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst * Load) {
}
void AMDGPUAnnotateUniformValues::visitBranchInst(BranchInst &I) {
- if (I.isUnconditional())
- return;
-
- Value *Cond = I.getCondition();
- if (!DA->isUniform(Cond))
- return;
-
- setUniformMetadata(I.getParent()->getTerminator());
+ if (DA->isUniform(&I))
+ setUniformMetadata(I.getParent()->getTerminator());
}
void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) {
@@ -133,7 +126,7 @@ void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) {
if (!DA->isUniform(Ptr))
return;
auto isGlobalLoad = [&](LoadInst &Load)->bool {
- return Load.getPointerAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS;
+ return Load.getPointerAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
};
// We're tracking up to the Function boundaries
// We cannot go beyond because of FunctionPass restrictions
@@ -168,7 +161,6 @@ void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) {
}
bool AMDGPUAnnotateUniformValues::doInitialization(Module &M) {
- AMDGPUASI = AMDGPU::getAMDGPUAS(M);
return false;
}
@@ -176,7 +168,7 @@ bool AMDGPUAnnotateUniformValues::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
- DA = &getAnalysis<DivergenceAnalysis>();
+ DA = &getAnalysis<LegacyDivergenceAnalysis>();
MDR = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
isKernelFunc = F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index e62e5d52ad74f..2ded7cdb64899 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -40,11 +40,13 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/Support/AMDGPUMetadata.h"
#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TargetParser.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
using namespace llvm;
using namespace llvm::AMDGPU;
+using namespace llvm::AMDGPU::HSAMD;
// TODO: This should get the default rounding mode from the kernel. We just set
// the default here, but this could change if the OpenCL rounding mode pragmas
@@ -98,8 +100,11 @@ extern "C" void LLVMInitializeAMDGPUAsmPrinter() {
AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
std::unique_ptr<MCStreamer> Streamer)
: AsmPrinter(TM, std::move(Streamer)) {
- AMDGPUASI = static_cast<AMDGPUTargetMachine*>(&TM)->getAMDGPUAS();
- }
+ if (IsaInfo::hasCodeObjectV3(getSTI()))
+ HSAMetadataStream.reset(new MetadataStreamerV3());
+ else
+ HSAMetadataStream.reset(new MetadataStreamerV2());
+}
StringRef AMDGPUAsmPrinter::getPassName() const {
return "AMDGPU Assembly Printer";
@@ -116,62 +121,70 @@ AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const {
}
void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
- if (IsaInfo::hasCodeObjectV3(getSTI()) &&
- TM.getTargetTriple().getOS() == Triple::AMDHSA)
- return;
+ if (IsaInfo::hasCodeObjectV3(getSTI())) {
+ std::string ExpectedTarget;
+ raw_string_ostream ExpectedTargetOS(ExpectedTarget);
+ IsaInfo::streamIsaVersion(getSTI(), ExpectedTargetOS);
+
+ getTargetStreamer()->EmitDirectiveAMDGCNTarget(ExpectedTarget);
+ }
if (TM.getTargetTriple().getOS() != Triple::AMDHSA &&
TM.getTargetTriple().getOS() != Triple::AMDPAL)
return;
if (TM.getTargetTriple().getOS() == Triple::AMDHSA)
- HSAMetadataStream.begin(M);
+ HSAMetadataStream->begin(M);
if (TM.getTargetTriple().getOS() == Triple::AMDPAL)
readPALMetadata(M);
+ if (IsaInfo::hasCodeObjectV3(getSTI()))
+ return;
+
// HSA emits NT_AMDGPU_HSA_CODE_OBJECT_VERSION for code objects v2.
if (TM.getTargetTriple().getOS() == Triple::AMDHSA)
getTargetStreamer()->EmitDirectiveHSACodeObjectVersion(2, 1);
// HSA and PAL emit NT_AMDGPU_HSA_ISA for code objects v2.
- IsaInfo::IsaVersion ISA = IsaInfo::getIsaVersion(getSTI()->getFeatureBits());
+ IsaVersion Version = getIsaVersion(getSTI()->getCPU());
getTargetStreamer()->EmitDirectiveHSACodeObjectISA(
- ISA.Major, ISA.Minor, ISA.Stepping, "AMD", "AMDGPU");
+ Version.Major, Version.Minor, Version.Stepping, "AMD", "AMDGPU");
}
void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) {
- // TODO: Add metadata to code object v3.
- if (IsaInfo::hasCodeObjectV3(getSTI()) &&
- TM.getTargetTriple().getOS() == Triple::AMDHSA)
- return;
-
// Following code requires TargetStreamer to be present.
if (!getTargetStreamer())
return;
- // Emit ISA Version (NT_AMD_AMDGPU_ISA).
- std::string ISAVersionString;
- raw_string_ostream ISAVersionStream(ISAVersionString);
- IsaInfo::streamIsaVersion(getSTI(), ISAVersionStream);
- getTargetStreamer()->EmitISAVersion(ISAVersionStream.str());
+ if (!IsaInfo::hasCodeObjectV3(getSTI())) {
+ // Emit ISA Version (NT_AMD_AMDGPU_ISA).
+ std::string ISAVersionString;
+ raw_string_ostream ISAVersionStream(ISAVersionString);
+ IsaInfo::streamIsaVersion(getSTI(), ISAVersionStream);
+ getTargetStreamer()->EmitISAVersion(ISAVersionStream.str());
+ }
// Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).
if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
- HSAMetadataStream.end();
- getTargetStreamer()->EmitHSAMetadata(HSAMetadataStream.getHSAMetadata());
+ HSAMetadataStream->end();
+ bool Success = HSAMetadataStream->emitTo(*getTargetStreamer());
+ (void)Success;
+ assert(Success && "Malformed HSA Metadata");
}
- // Emit PAL Metadata (NT_AMD_AMDGPU_PAL_METADATA).
- if (TM.getTargetTriple().getOS() == Triple::AMDPAL) {
- // Copy the PAL metadata from the map where we collected it into a vector,
- // then write it as a .note.
- PALMD::Metadata PALMetadataVector;
- for (auto i : PALMetadataMap) {
- PALMetadataVector.push_back(i.first);
- PALMetadataVector.push_back(i.second);
+ if (!IsaInfo::hasCodeObjectV3(getSTI())) {
+ // Emit PAL Metadata (NT_AMD_AMDGPU_PAL_METADATA).
+ if (TM.getTargetTriple().getOS() == Triple::AMDPAL) {
+ // Copy the PAL metadata from the map where we collected it into a vector,
+ // then write it as a .note.
+ PALMD::Metadata PALMetadataVector;
+ for (auto i : PALMetadataMap) {
+ PALMetadataVector.push_back(i.first);
+ PALMetadataVector.push_back(i.second);
+ }
+ getTargetStreamer()->EmitPALMetadata(PALMetadataVector);
}
- getTargetStreamer()->EmitPALMetadata(PALMetadataVector);
}
}
@@ -193,13 +206,10 @@ void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
if (!MFI.isEntryFunction())
return;
- if (IsaInfo::hasCodeObjectV3(getSTI()) &&
- TM.getTargetTriple().getOS() == Triple::AMDHSA)
- return;
const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
const Function &F = MF->getFunction();
- if (STM.isAmdCodeObjectV2(F) &&
+ if (!STM.hasCodeObjectV3() && STM.isAmdHsaOrMesa(F) &&
(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
amd_kernel_code_t KernelCode;
@@ -207,10 +217,8 @@ void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
getTargetStreamer()->EmitAMDKernelCodeT(KernelCode);
}
- if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
- return;
-
- HSAMetadataStream.emitKernel(*MF, CurrentProgramInfo);
+ if (STM.isAmdHsaOS())
+ HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
}
void AMDGPUAsmPrinter::EmitFunctionBodyEnd() {
@@ -241,7 +249,7 @@ void AMDGPUAsmPrinter::EmitFunctionBodyEnd() {
*getSTI(), KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
CurrentProgramInfo.NumVGPRsForWavesPerEU,
CurrentProgramInfo.NumSGPRsForWavesPerEU -
- IsaInfo::getNumExtraSGPRs(getSTI()->getFeatureBits(),
+ IsaInfo::getNumExtraSGPRs(getSTI(),
CurrentProgramInfo.VCCUsed,
CurrentProgramInfo.FlatUsed),
CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
@@ -259,7 +267,7 @@ void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
- if (MFI->isEntryFunction() && STM.isAmdCodeObjectV2(MF->getFunction())) {
+ if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) {
SmallString<128> SymbolName;
getNameWithPrefix(SymbolName, &MF->getFunction()),
getTargetStreamer()->EmitAMDGPUSymbolType(
@@ -562,7 +570,7 @@ static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumSGPRs(
const GCNSubtarget &ST) const {
- return NumExplicitSGPR + IsaInfo::getNumExtraSGPRs(ST.getFeatureBits(),
+ return NumExplicitSGPR + IsaInfo::getNumExtraSGPRs(&ST,
UsesVCC, UsesFlatScratch);
}
@@ -759,7 +767,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
// 48 SGPRs - vcc, - flat_scr, -xnack
int MaxSGPRGuess =
- 47 - IsaInfo::getNumExtraSGPRs(ST.getFeatureBits(), true,
+ 47 - IsaInfo::getNumExtraSGPRs(getSTI(), true,
ST.hasFlatAddressSpace());
MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess);
MaxVGPR = std::max(MaxVGPR, 23);
@@ -824,7 +832,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
// duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
// unified.
unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs(
- STM.getFeatureBits(), ProgInfo.VCCUsed, ProgInfo.FlatUsed);
+ getSTI(), ProgInfo.VCCUsed, ProgInfo.FlatUsed);
// Check the addressable register limit before we add ExtraSGPRs.
if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
@@ -906,9 +914,9 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
}
ProgInfo.SGPRBlocks = IsaInfo::getNumSGPRBlocks(
- STM.getFeatureBits(), ProgInfo.NumSGPRsForWavesPerEU);
+ &STM, ProgInfo.NumSGPRsForWavesPerEU);
ProgInfo.VGPRBlocks = IsaInfo::getNumVGPRBlocks(
- STM.getFeatureBits(), ProgInfo.NumVGPRsForWavesPerEU);
+ &STM, ProgInfo.NumVGPRsForWavesPerEU);
// Update DebuggerWavefrontPrivateSegmentOffsetSGPR and
// DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue"
@@ -1003,7 +1011,6 @@ static unsigned getRsrcReg(CallingConv::ID CallConv) {
void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
const SIProgramInfo &CurrentProgramInfo) {
- const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
@@ -1024,10 +1031,9 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
OutStreamer->EmitIntValue(RsrcReg, 4);
OutStreamer->EmitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
- if (STM.isVGPRSpillingEnabled(MF.getFunction())) {
- OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);
- OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);
- }
+ OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);
+ OutStreamer->EmitIntValue(
+ S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);
}
if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
@@ -1138,7 +1144,7 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
- AMDGPU::initDefaultAMDKernelCodeT(Out, STM.getFeatureBits());
+ AMDGPU::initDefaultAMDKernelCodeT(Out, getSTI());
Out.compute_pgm_resource_registers =
CurrentProgramInfo.ComputePGMRSrc1 |
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index 22982d912c708..167ac4b21e1e2 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -56,7 +56,7 @@ private:
SIProgramInfo CurrentProgramInfo;
DenseMap<const Function *, SIFunctionResourceInfo> CallGraphResourceInfo;
- AMDGPU::HSAMD::MetadataStreamer HSAMetadataStream;
+ std::unique_ptr<AMDGPU::HSAMD::MetadataStreamer> HSAMetadataStream;
std::map<uint32_t, uint32_t> PALMetadataMap;
uint64_t getFunctionCodeSize(const MachineFunction &MF) const;
@@ -143,7 +143,6 @@ public:
protected:
mutable std::vector<std::string> DisasmLines, HexLines;
mutable size_t DisasmLineMaxLen;
- AMDGPUAS AMDGPUASI;
};
} // end namespace llvm
diff --git a/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
new file mode 100644
index 0000000000000..644e4fd558bad
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -0,0 +1,458 @@
+//===-- AMDGPUAtomicOptimizer.cpp -----------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass optimizes atomic operations by using a single lane of a wavefront
+/// to perform the atomic operation, thus reducing contention on that memory
+/// location.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+#define DEBUG_TYPE "amdgpu-atomic-optimizer"
+
+using namespace llvm;
+
+namespace {
+
+enum DPP_CTRL {
+ DPP_ROW_SR1 = 0x111,
+ DPP_ROW_SR2 = 0x112,
+ DPP_ROW_SR4 = 0x114,
+ DPP_ROW_SR8 = 0x118,
+ DPP_WF_SR1 = 0x138,
+ DPP_ROW_BCAST15 = 0x142,
+ DPP_ROW_BCAST31 = 0x143
+};
+
+struct ReplacementInfo {
+ Instruction *I;
+ Instruction::BinaryOps Op;
+ unsigned ValIdx;
+ bool ValDivergent;
+};
+
+class AMDGPUAtomicOptimizer : public FunctionPass,
+ public InstVisitor<AMDGPUAtomicOptimizer> {
+private:
+ SmallVector<ReplacementInfo, 8> ToReplace;
+ const LegacyDivergenceAnalysis *DA;
+ const DataLayout *DL;
+ DominatorTree *DT;
+ bool HasDPP;
+ bool IsPixelShader;
+
+ void optimizeAtomic(Instruction &I, Instruction::BinaryOps Op,
+ unsigned ValIdx, bool ValDivergent) const;
+
+ void setConvergent(CallInst *const CI) const;
+
+public:
+ static char ID;
+
+ AMDGPUAtomicOptimizer() : FunctionPass(ID) {}
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<LegacyDivergenceAnalysis>();
+ AU.addRequired<TargetPassConfig>();
+ }
+
+ void visitAtomicRMWInst(AtomicRMWInst &I);
+ void visitIntrinsicInst(IntrinsicInst &I);
+};
+
+} // namespace
+
+char AMDGPUAtomicOptimizer::ID = 0;
+
+char &llvm::AMDGPUAtomicOptimizerID = AMDGPUAtomicOptimizer::ID;
+
+bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
+ if (skipFunction(F)) {
+ return false;
+ }
+
+ DA = &getAnalysis<LegacyDivergenceAnalysis>();
+ DL = &F.getParent()->getDataLayout();
+ DominatorTreeWrapperPass *const DTW =
+ getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ DT = DTW ? &DTW->getDomTree() : nullptr;
+ const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
+ const TargetMachine &TM = TPC.getTM<TargetMachine>();
+ const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+ HasDPP = ST.hasDPP();
+ IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;
+
+ visit(F);
+
+ const bool Changed = !ToReplace.empty();
+
+ for (ReplacementInfo &Info : ToReplace) {
+ optimizeAtomic(*Info.I, Info.Op, Info.ValIdx, Info.ValDivergent);
+ }
+
+ ToReplace.clear();
+
+ return Changed;
+}
+
+void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
+ // Early exit for unhandled address space atomic instructions.
+ switch (I.getPointerAddressSpace()) {
+ default:
+ return;
+ case AMDGPUAS::GLOBAL_ADDRESS:
+ case AMDGPUAS::LOCAL_ADDRESS:
+ break;
+ }
+
+ Instruction::BinaryOps Op;
+
+ switch (I.getOperation()) {
+ default:
+ return;
+ case AtomicRMWInst::Add:
+ Op = Instruction::Add;
+ break;
+ case AtomicRMWInst::Sub:
+ Op = Instruction::Sub;
+ break;
+ }
+
+ const unsigned PtrIdx = 0;
+ const unsigned ValIdx = 1;
+
+ // If the pointer operand is divergent, then each lane is doing an atomic
+ // operation on a different address, and we cannot optimize that.
+ if (DA->isDivergent(I.getOperand(PtrIdx))) {
+ return;
+ }
+
+ const bool ValDivergent = DA->isDivergent(I.getOperand(ValIdx));
+
+ // If the value operand is divergent, each lane is contributing a different
+ // value to the atomic calculation. We can only optimize divergent values if
+ // we have DPP available on our subtarget, and the atomic operation is 32
+ // bits.
+ if (ValDivergent && (!HasDPP || (DL->getTypeSizeInBits(I.getType()) != 32))) {
+ return;
+ }
+
+ // If we get here, we can optimize the atomic using a single wavefront-wide
+ // atomic operation to do the calculation for the entire wavefront, so
+ // remember the instruction so we can come back to it.
+ const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};
+
+ ToReplace.push_back(Info);
+}
+
+void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
+ Instruction::BinaryOps Op;
+
+ switch (I.getIntrinsicID()) {
+ default:
+ return;
+ case Intrinsic::amdgcn_buffer_atomic_add:
+ case Intrinsic::amdgcn_struct_buffer_atomic_add:
+ case Intrinsic::amdgcn_raw_buffer_atomic_add:
+ Op = Instruction::Add;
+ break;
+ case Intrinsic::amdgcn_buffer_atomic_sub:
+ case Intrinsic::amdgcn_struct_buffer_atomic_sub:
+ case Intrinsic::amdgcn_raw_buffer_atomic_sub:
+ Op = Instruction::Sub;
+ break;
+ }
+
+ const unsigned ValIdx = 0;
+
+ const bool ValDivergent = DA->isDivergent(I.getOperand(ValIdx));
+
+ // If the value operand is divergent, each lane is contributing a different
+ // value to the atomic calculation. We can only optimize divergent values if
+ // we have DPP available on our subtarget, and the atomic operation is 32
+ // bits.
+ if (ValDivergent && (!HasDPP || (DL->getTypeSizeInBits(I.getType()) != 32))) {
+ return;
+ }
+
+ // If any of the other arguments to the intrinsic are divergent, we can't
+ // optimize the operation.
+ for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
+ if (DA->isDivergent(I.getOperand(Idx))) {
+ return;
+ }
+ }
+
+ // If we get here, we can optimize the atomic using a single wavefront-wide
+ // atomic operation to do the calculation for the entire wavefront, so
+ // remember the instruction so we can come back to it.
+ const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};
+
+ ToReplace.push_back(Info);
+}
+
+void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
+ Instruction::BinaryOps Op,
+ unsigned ValIdx,
+ bool ValDivergent) const {
+ LLVMContext &Context = I.getContext();
+
+ // Start building just before the instruction.
+ IRBuilder<> B(&I);
+
+ // If we are in a pixel shader, because of how we have to mask out helper
+ // lane invocations, we need to record the entry and exit BB's.
+ BasicBlock *PixelEntryBB = nullptr;
+ BasicBlock *PixelExitBB = nullptr;
+
+ // If we're optimizing an atomic within a pixel shader, we need to wrap the
+ // entire atomic operation in a helper-lane check. We do not want any helper
+ // lanes that are around only for the purposes of derivatives to take part
+ // in any cross-lane communication, and we use a branch on whether the lane is
+ // live to do this.
+ if (IsPixelShader) {
+ // Record I's original position as the entry block.
+ PixelEntryBB = I.getParent();
+
+ Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {}, {});
+ Instruction *const NonHelperTerminator =
+ SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);
+
+ // Record I's new position as the exit block.
+ PixelExitBB = I.getParent();
+
+ I.moveBefore(NonHelperTerminator);
+ B.SetInsertPoint(&I);
+ }
+
+ Type *const Ty = I.getType();
+ const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
+ Type *const VecTy = VectorType::get(B.getInt32Ty(), 2);
+
+ // This is the value in the atomic operation we need to combine in order to
+ // reduce the number of atomic operations.
+ Value *const V = I.getOperand(ValIdx);
+
+ // We need to know how many lanes are active within the wavefront, and we do
+ // this by getting the exec register, which tells us all the lanes that are
+ // active.
+ MDNode *const RegName =
+ llvm::MDNode::get(Context, llvm::MDString::get(Context, "exec"));
+ Value *const Metadata = llvm::MetadataAsValue::get(Context, RegName);
+ CallInst *const Exec =
+ B.CreateIntrinsic(Intrinsic::read_register, {B.getInt64Ty()}, {Metadata});
+ setConvergent(Exec);
+
+ // We need to know how many lanes are active within the wavefront that are
+ // below us. If we counted each lane linearly starting from 0, a lane is
+ // below us only if its associated index was less than ours. We do this by
+ // using the mbcnt intrinsic.
+ Value *const BitCast = B.CreateBitCast(Exec, VecTy);
+ Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0));
+ Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1));
+ CallInst *const PartialMbcnt = B.CreateIntrinsic(
+ Intrinsic::amdgcn_mbcnt_lo, {}, {ExtractLo, B.getInt32(0)});
+ CallInst *const Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {},
+ {ExtractHi, PartialMbcnt});
+
+ Value *const MbcntCast = B.CreateIntCast(Mbcnt, Ty, false);
+
+ Value *LaneOffset = nullptr;
+ Value *NewV = nullptr;
+
+ // If we have a divergent value in each lane, we need to combine the value
+ // using DPP.
+ if (ValDivergent) {
+ // First we need to set all inactive invocations to 0, so that they can
+ // correctly contribute to the final result.
+ CallInst *const SetInactive = B.CreateIntrinsic(
+ Intrinsic::amdgcn_set_inactive, Ty, {V, B.getIntN(TyBitWidth, 0)});
+ setConvergent(SetInactive);
+ NewV = SetInactive;
+
+ const unsigned Iters = 6;
+ const unsigned DPPCtrl[Iters] = {DPP_ROW_SR1, DPP_ROW_SR2,
+ DPP_ROW_SR4, DPP_ROW_SR8,
+ DPP_ROW_BCAST15, DPP_ROW_BCAST31};
+ const unsigned RowMask[Iters] = {0xf, 0xf, 0xf, 0xf, 0xa, 0xc};
+
+ // This loop performs an inclusive scan across the wavefront, with all lanes
+ // active (by using the WWM intrinsic).
+ for (unsigned Idx = 0; Idx < Iters; Idx++) {
+ CallInst *const DPP = B.CreateIntrinsic(Intrinsic::amdgcn_mov_dpp, Ty,
+ {NewV, B.getInt32(DPPCtrl[Idx]),
+ B.getInt32(RowMask[Idx]),
+ B.getInt32(0xf), B.getFalse()});
+ setConvergent(DPP);
+ Value *const WWM = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, DPP);
+
+ NewV = B.CreateBinOp(Op, NewV, WWM);
+ NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV);
+ }
+
+ // NewV has returned the inclusive scan of V, but for the lane offset we
+ // require an exclusive scan. We do this by shifting the values from the
+ // entire wavefront right by 1, and by setting the bound_ctrl (last argument
+ // to the intrinsic below) to true, we can guarantee that 0 will be shifted
+ // into the 0'th invocation.
+ CallInst *const DPP =
+ B.CreateIntrinsic(Intrinsic::amdgcn_mov_dpp, {Ty},
+ {NewV, B.getInt32(DPP_WF_SR1), B.getInt32(0xf),
+ B.getInt32(0xf), B.getTrue()});
+ setConvergent(DPP);
+ LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, DPP);
+
+ // Read the value from the last lane, which has accumulated the values of
+ // each active lane in the wavefront. This will be the new value that we
+ // provide to the atomic operation.
+ if (TyBitWidth == 64) {
+ Value *const ExtractLo = B.CreateTrunc(NewV, B.getInt32Ty());
+ Value *const ExtractHi =
+ B.CreateTrunc(B.CreateLShr(NewV, B.getInt64(32)), B.getInt32Ty());
+ CallInst *const ReadLaneLo = B.CreateIntrinsic(
+ Intrinsic::amdgcn_readlane, {}, {ExtractLo, B.getInt32(63)});
+ setConvergent(ReadLaneLo);
+ CallInst *const ReadLaneHi = B.CreateIntrinsic(
+ Intrinsic::amdgcn_readlane, {}, {ExtractHi, B.getInt32(63)});
+ setConvergent(ReadLaneHi);
+ Value *const PartialInsert = B.CreateInsertElement(
+ UndefValue::get(VecTy), ReadLaneLo, B.getInt32(0));
+ Value *const Insert =
+ B.CreateInsertElement(PartialInsert, ReadLaneHi, B.getInt32(1));
+ NewV = B.CreateBitCast(Insert, Ty);
+ } else if (TyBitWidth == 32) {
+ CallInst *const ReadLane = B.CreateIntrinsic(Intrinsic::amdgcn_readlane,
+ {}, {NewV, B.getInt32(63)});
+ setConvergent(ReadLane);
+ NewV = ReadLane;
+ } else {
+ llvm_unreachable("Unhandled atomic bit width");
+ }
+ } else {
+ // Get the total number of active lanes we have by using popcount.
+ Instruction *const Ctpop = B.CreateUnaryIntrinsic(Intrinsic::ctpop, Exec);
+ Value *const CtpopCast = B.CreateIntCast(Ctpop, Ty, false);
+
+ // Calculate the new value we will be contributing to the atomic operation
+ // for the entire wavefront.
+ NewV = B.CreateMul(V, CtpopCast);
+ LaneOffset = B.CreateMul(V, MbcntCast);
+ }
+
+ // We only want a single lane to enter our new control flow, and we do this
+ // by checking if there are any active lanes below us. Only one lane will
+ // have 0 active lanes below us, so that will be the only one to progress.
+ Value *const Cond = B.CreateICmpEQ(MbcntCast, B.getIntN(TyBitWidth, 0));
+
+ // Store I's original basic block before we split the block.
+ BasicBlock *const EntryBB = I.getParent();
+
+ // We need to introduce some new control flow to force a single lane to be
+ // active. We do this by splitting I's basic block at I, and introducing the
+ // new block such that:
+ // entry --> single_lane -\
+ // \------------------> exit
+ Instruction *const SingleLaneTerminator =
+ SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);
+
+ // Move the IR builder into single_lane next.
+ B.SetInsertPoint(SingleLaneTerminator);
+
+ // Clone the original atomic operation into single lane, replacing the
+ // original value with our newly created one.
+ Instruction *const NewI = I.clone();
+ B.Insert(NewI);
+ NewI->setOperand(ValIdx, NewV);
+
+ // Move the IR builder into exit next, and start inserting just before the
+ // original instruction.
+ B.SetInsertPoint(&I);
+
+ // Create a PHI node to get our new atomic result into the exit block.
+ PHINode *const PHI = B.CreatePHI(Ty, 2);
+ PHI->addIncoming(UndefValue::get(Ty), EntryBB);
+ PHI->addIncoming(NewI, SingleLaneTerminator->getParent());
+
+ // We need to broadcast the value from the lowest active lane (the first
+ // lane) to all other lanes in the wavefront. We use an intrinsic for this,
+ // but have to handle 64-bit broadcasts with two calls to this intrinsic.
+ Value *BroadcastI = nullptr;
+
+ if (TyBitWidth == 64) {
+ Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());
+ Value *const ExtractHi =
+ B.CreateTrunc(B.CreateLShr(PHI, B.getInt64(32)), B.getInt32Ty());
+ CallInst *const ReadFirstLaneLo =
+ B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
+ setConvergent(ReadFirstLaneLo);
+ CallInst *const ReadFirstLaneHi =
+ B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
+ setConvergent(ReadFirstLaneHi);
+ Value *const PartialInsert = B.CreateInsertElement(
+ UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
+ Value *const Insert =
+ B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
+ BroadcastI = B.CreateBitCast(Insert, Ty);
+ } else if (TyBitWidth == 32) {
+ CallInst *const ReadFirstLane =
+ B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
+ setConvergent(ReadFirstLane);
+ BroadcastI = ReadFirstLane;
+ } else {
+ llvm_unreachable("Unhandled atomic bit width");
+ }
+
+ // Now that we have the result of our single atomic operation, we need to
+ // get our individual lane's slice into the result. We use the lane offset we
+ // previously calculated combined with the atomic result value we got from the
+ // first lane, to get our lane's index into the atomic result.
+ Value *const Result = B.CreateBinOp(Op, BroadcastI, LaneOffset);
+
+ if (IsPixelShader) {
+ // Need a final PHI to reconverge to above the helper lane branch mask.
+ B.SetInsertPoint(PixelExitBB->getFirstNonPHI());
+
+ PHINode *const PHI = B.CreatePHI(Ty, 2);
+ PHI->addIncoming(UndefValue::get(Ty), PixelEntryBB);
+ PHI->addIncoming(Result, I.getParent());
+ I.replaceAllUsesWith(PHI);
+ } else {
+ // Replace the original atomic instruction with the new one.
+ I.replaceAllUsesWith(Result);
+ }
+
+ // And delete the original.
+ I.eraseFromParent();
+}
+
+void AMDGPUAtomicOptimizer::setConvergent(CallInst *const CI) const {
+ CI->addAttribute(AttributeList::FunctionIndex, Attribute::Convergent);
+}
+
+INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE,
+ "AMDGPU atomic optimizations", false, false)
+INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE,
+ "AMDGPU atomic optimizations", false, false)
+
+FunctionPass *llvm::createAMDGPUAtomicOptimizerPass() {
+ return new AMDGPUAtomicOptimizer();
+}
diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 18c7df0d94f21..daef37f9c21fb 100644
--- a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -28,11 +28,12 @@
using namespace llvm;
AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
- : CallLowering(&TLI), AMDGPUASI(TLI.getAMDGPUAS()) {
+ : CallLowering(&TLI) {
}
bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
- const Value *Val, unsigned VReg) const {
+ const Value *Val,
+ ArrayRef<unsigned> VRegs) const {
// FIXME: Add support for non-void returns.
if (Val)
return false;
@@ -50,7 +51,7 @@ unsigned AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder,
MachineRegisterInfo &MRI = MF.getRegInfo();
const Function &F = MF.getFunction();
const DataLayout &DL = F.getParent()->getDataLayout();
- PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUASI.CONSTANT_ADDRESS);
+ PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS);
LLT PtrType = getLLTForType(*PtrTy, DL);
unsigned DstReg = MRI.createGenericVirtualRegister(PtrType);
unsigned KernArgSegmentPtr =
@@ -72,7 +73,7 @@ void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder,
MachineFunction &MF = MIRBuilder.getMF();
const Function &F = MF.getFunction();
const DataLayout &DL = F.getParent()->getDataLayout();
- PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUASI.CONSTANT_ADDRESS);
+ PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS);
MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
unsigned TypeSize = DL.getTypeStoreSize(ParamTy);
unsigned PtrReg = lowerParameterPtr(MIRBuilder, ParamTy, Offset);
diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.h b/lib/Target/AMDGPU/AMDGPUCallLowering.h
index f51cb6abbf65c..ed859716218ef 100644
--- a/lib/Target/AMDGPU/AMDGPUCallLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -23,8 +23,6 @@ namespace llvm {
class AMDGPUTargetLowering;
class AMDGPUCallLowering: public CallLowering {
- AMDGPUAS AMDGPUASI;
-
unsigned lowerParameterPtr(MachineIRBuilder &MIRBuilder, Type *ParamTy,
uint64_t Offset) const;
@@ -35,8 +33,8 @@ class AMDGPUCallLowering: public CallLowering {
public:
AMDGPUCallLowering(const AMDGPUTargetLowering &TLI);
- bool lowerReturn(MachineIRBuilder &MIRBuiler, const Value *Val,
- unsigned VReg) const override;
+ bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val,
+ ArrayRef<unsigned> VRegs) const override;
bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
ArrayRef<unsigned> VRegs) const override;
static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);
diff --git a/lib/Target/AMDGPU/AMDGPUCallingConv.td b/lib/Target/AMDGPU/AMDGPUCallingConv.td
index 68bc7fdd99618..367f120b5fa6b 100644
--- a/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -19,7 +19,7 @@ class CCIfExtend<CCAction A>
// Calling convention for SI
def CC_SI : CallingConv<[
- CCIfInReg<CCIfType<[f32, i32, f16] , CCAssignToReg<[
+ CCIfInReg<CCIfType<[f32, i32, f16, v2i16, v2f16] , CCAssignToReg<[
SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
@@ -33,7 +33,7 @@ def CC_SI : CallingConv<[
CCIfByVal<CCIfType<[i64], CCCustom<"allocateSGPRTuple">>>,
// 32*4 + 4 is the minimum for a fetch shader consumer with 32 inputs.
- CCIfNotInReg<CCIfType<[f32, i32, f16] , CCAssignToReg<[
+ CCIfNotInReg<CCIfType<[f32, i32, f16, v2i16, v2f16] , CCAssignToReg<[
VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
@@ -64,7 +64,7 @@ def RetCC_SI_Shader : CallingConv<[
]>>,
// 32*4 + 4 is the minimum for a fetch shader with 32 outputs.
- CCIfType<[f32, f16] , CCAssignToReg<[
+ CCIfType<[f32, f16, v2f16] , CCAssignToReg<[
VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
diff --git a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 5713b7b7f9a84..4dc1e67c573d3 100644
--- a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -18,7 +18,7 @@
#include "AMDGPUTargetMachine.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Passes.h"
@@ -60,10 +60,9 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
public InstVisitor<AMDGPUCodeGenPrepare, bool> {
const GCNSubtarget *ST = nullptr;
AssumptionCache *AC = nullptr;
- DivergenceAnalysis *DA = nullptr;
+ LegacyDivergenceAnalysis *DA = nullptr;
Module *Mod = nullptr;
bool HasUnsafeFPMath = false;
- AMDGPUAS AMDGPUASI;
/// Copies exact/nsw/nuw flags (if any) from binary operation \p I to
/// binary operation \p V.
@@ -177,7 +176,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<DivergenceAnalysis>();
+ AU.addRequired<LegacyDivergenceAnalysis>();
AU.setPreservesAll();
}
};
@@ -559,7 +558,7 @@ Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
Value *FQM = Builder.CreateFMul(FA, RCP);
// fq = trunc(fqm);
- CallInst* FQ = Builder.CreateIntrinsic(Intrinsic::trunc, { FQM });
+ CallInst *FQ = Builder.CreateUnaryIntrinsic(Intrinsic::trunc, FQM);
FQ->copyFastMathFlags(Builder.getFastMathFlags());
// float fqneg = -fq;
@@ -567,17 +566,17 @@ Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
// float fr = mad(fqneg, fb, fa);
Value *FR = Builder.CreateIntrinsic(Intrinsic::amdgcn_fmad_ftz,
- { FQNeg, FB, FA }, FQ);
+ {FQNeg->getType()}, {FQNeg, FB, FA}, FQ);
// int iq = (int)fq;
Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty)
: Builder.CreateFPToUI(FQ, I32Ty);
// fr = fabs(fr);
- FR = Builder.CreateIntrinsic(Intrinsic::fabs, { FR }, FQ);
+ FR = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FR, FQ);
// fb = fabs(fb);
- FB = Builder.CreateIntrinsic(Intrinsic::fabs, { FB }, FQ);
+ FB = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FB, FQ);
// int cv = fr >= fb;
Value *CV = Builder.CreateFCmpOGE(FR, FB);
@@ -799,8 +798,8 @@ bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
if (!WidenLoads)
return false;
- if ((I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
- I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
+ if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
+ I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
canWidenScalarExtLoad(I)) {
IRBuilder<> Builder(&I);
Builder.SetCurrentDebugLocation(I.getDebugLoc());
@@ -898,9 +897,8 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
ST = &TM.getSubtarget<GCNSubtarget>(F);
AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- DA = &getAnalysis<DivergenceAnalysis>();
+ DA = &getAnalysis<LegacyDivergenceAnalysis>();
HasUnsafeFPMath = hasUnsafeFPMath(F);
- AMDGPUASI = TM.getAMDGPUAS();
bool MadeChange = false;
@@ -918,7 +916,7 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
"AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
false, false)
diff --git a/lib/Target/AMDGPU/AMDGPUFeatures.td b/lib/Target/AMDGPU/AMDGPUFeatures.td
index b375cae9018ea..3c7d8a8fc5509 100644
--- a/lib/Target/AMDGPU/AMDGPUFeatures.td
+++ b/lib/Target/AMDGPU/AMDGPUFeatures.td
@@ -19,6 +19,15 @@ def FeatureFMA : SubtargetFeature<"fmaf",
"Enable single precision FMA (not as fast as mul+add, but fused)"
>;
+// Single precision denormal handling. Even with this flag set, some
+// instructions still do not support denormals, and turning fp32 denormals on
+// makes instructions run at the device's double precision rate.
+def FeatureFP32Denormals : SubtargetFeature<
+  "fp32-denormals", "FP32Denormals", "true",
+  "Enable single precision denormal handling">;
+
class SubtargetFeatureLocalMemorySize <int Value> : SubtargetFeature<
"localmemorysize"#Value,
"LocalMemorySize",
diff --git a/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp b/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp
new file mode 100644
index 0000000000000..6e2a981d33968
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp
@@ -0,0 +1,63 @@
+//===-- AMDGPUFixFunctionBitcasts.cpp - Fix function bitcasts -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Promote indirect (bitcast) calls to direct calls when they are statically
+/// known to be direct. Required when InstCombine is not run (e.g. at OptNone)
+/// because AMDGPU does not support indirect calls.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/Transforms/Utils/CallPromotionUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-fix-function-bitcasts"
+
+namespace {
+/// Module pass that turns calls made through a pointer cast of a known
+/// function into direct calls to that function.
+///
+/// The pass is also an InstVisitor: runOnModule walks the module and
+/// visitCallSite is invoked for each call site encountered.
+class AMDGPUFixFunctionBitcasts final
+    : public ModulePass,
+      public InstVisitor<AMDGPUFixFunctionBitcasts> {
+
+  /// Visits every call site in \p M and returns true if any was promoted.
+  bool runOnModule(Module &M) override;
+
+  /// Set once visitCallSite has promoted a call. Initialized in-class so the
+  /// flag never holds an indeterminate value, even if visitCallSite (which is
+  /// public) is reached before runOnModule has reset it.
+  bool Modified = false;
+
+public:
+  /// Promote \p CS to a direct call when its callee, after stripping pointer
+  /// casts, is a Function and isLegalToPromote accepts the pair.
+  void visitCallSite(CallSite CS) {
+    // Already a direct call: nothing to fix.
+    if (CS.getCalledFunction())
+      return;
+    // Look through casts on the called value to find the underlying function.
+    auto *Callee = dyn_cast<Function>(CS.getCalledValue()->stripPointerCasts());
+    if (Callee && isLegalToPromote(CS, Callee)) {
+      promoteCall(CS, Callee);
+      Modified = true;
+    }
+  }
+
+  static char ID;
+  AMDGPUFixFunctionBitcasts() : ModulePass(ID) {}
+};
+} // End anonymous namespace
+
+char AMDGPUFixFunctionBitcasts::ID = 0; // Storage for the static ID that the constructor hands to ModulePass.
+char &llvm::AMDGPUFixFunctionBitcastsID = AMDGPUFixFunctionBitcasts::ID; // Exposes that same object by reference in namespace llvm.
+INITIALIZE_PASS(AMDGPUFixFunctionBitcasts, DEBUG_TYPE, // Registers the pass under DEBUG_TYPE ("amdgpu-fix-function-bitcasts").
+ "Fix function bitcasts for AMDGPU", false, false)
+
+/// Factory for the bitcast fixer: allocates a new AMDGPUFixFunctionBitcasts
+/// and returns it through the ModulePass base pointer.
+ModulePass *llvm::createAMDGPUFixFunctionBitcastsPass() {
+  ModulePass *const NewPass = new AMDGPUFixFunctionBitcasts();
+  return NewPass;
+}
+
+/// Pass entry point: clears the modification flag, lets the InstVisitor walk
+/// \p M (which calls visitCallSite for each call site), then reports whether
+/// any call site was promoted.
+bool AMDGPUFixFunctionBitcasts::runOnModule(Module &M) {
+  this->Modified = false;
+  this->visit(M);
+  const bool Changed = this->Modified;
+  return Changed;
+}
diff --git a/lib/Target/AMDGPU/AMDGPUGISel.td b/lib/Target/AMDGPU/AMDGPUGISel.td
index ba735390f6791..59bb2a16e0f34 100644
--- a/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -122,15 +122,14 @@ def : GISelVop2CommutePat <sra, V_ASHRREV_I32_e32, i32>;
}
def : GISelVop3Pat2CommutePat <sra, V_ASHRREV_I32_e64, i32>;
-// FIXME: Select directly to _e32 so we don't need to deal with modifiers.
// FIXME: We can't re-use SelectionDAG patterns here because they match
// against a custom SDNode and we would need to create a generic machine
// instruction that is equivalent to the custom SDNode. This would also require
// us to custom legalize the intrinsic to the new generic machine instruction,
// but I can't get custom legalizing of intrinsic to work and I'm not sure if
// this is even supported yet.
-defm : GISelVop2IntrPat <
- int_amdgcn_cvt_pkrtz, V_CVT_PKRTZ_F16_F32_e32, v2f16, f32>;
+def : GISelVop3Pat2ModsPat <
+ int_amdgcn_cvt_pkrtz, V_CVT_PKRTZ_F16_F32_e64, v2f16, f32>;
defm : GISelVop2IntrPat <int_maxnum, V_MAX_F32_e32, f32>;
def : GISelVop3Pat2ModsPat <int_maxnum, V_MAX_F64, f64>;
diff --git a/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def b/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
index 3a58c6c6a29fe..6eab59ab4e09b 100644
--- a/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
+++ b/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
@@ -16,34 +16,38 @@ namespace AMDGPU {
enum PartialMappingIdx {
None = - 1,
- PM_SGPR1 = 0,
- PM_SGPR16 = 4,
- PM_SGPR32 = 5,
- PM_SGPR64 = 6,
- PM_SGPR128 = 7,
- PM_SGPR256 = 8,
- PM_SGPR512 = 9,
- PM_VGPR1 = 10,
- PM_VGPR16 = 14,
- PM_VGPR32 = 15,
- PM_VGPR64 = 16,
- PM_VGPR128 = 17,
- PM_VGPR256 = 18,
- PM_VGPR512 = 19,
- PM_SGPR96 = 20,
- PM_VGPR96 = 21
+ PM_SGPR1 = 2,
+ PM_SGPR16 = 6,
+ PM_SGPR32 = 7,
+ PM_SGPR64 = 8,
+ PM_SGPR128 = 9,
+ PM_SGPR256 = 10,
+ PM_SGPR512 = 11,
+ PM_VGPR1 = 12,
+ PM_VGPR16 = 16,
+ PM_VGPR32 = 17,
+ PM_VGPR64 = 18,
+ PM_VGPR128 = 19,
+ PM_VGPR256 = 20,
+ PM_VGPR512 = 21,
+ PM_SGPR96 = 22,
+ PM_VGPR96 = 23
};
const RegisterBankInfo::PartialMapping PartMappings[] {
// StartIdx, Length, RegBank
{0, 1, SCCRegBank},
+ {0, 1, VCCRegBank},
+
+ {0, 1, SGPRRegBank}, // SGPR begin
{0, 16, SGPRRegBank},
{0, 32, SGPRRegBank},
{0, 64, SGPRRegBank},
{0, 128, SGPRRegBank},
{0, 256, SGPRRegBank},
{0, 512, SGPRRegBank},
- {0, 1, SGPRRegBank},
+
+ {0, 1, VGPRRegBank}, // VGPR begin
{0, 16, VGPRRegBank},
{0, 32, VGPRRegBank},
{0, 64, VGPRRegBank},
@@ -55,33 +59,43 @@ const RegisterBankInfo::PartialMapping PartMappings[] {
};
const RegisterBankInfo::ValueMapping ValMappings[] {
+ // SCC
{&PartMappings[0], 1},
- {nullptr, 0},
- {nullptr, 0},
- {nullptr, 0},
+
+ // VCC
{&PartMappings[1], 1},
+
+ // SGPRs
{&PartMappings[2], 1},
+ {nullptr, 0}, // Illegal power of 2 sizes
+ {nullptr, 0},
+ {nullptr, 0},
{&PartMappings[3], 1},
{&PartMappings[4], 1},
{&PartMappings[5], 1},
{&PartMappings[6], 1},
{&PartMappings[7], 1},
+ {&PartMappings[8], 1},
+
+ // VGPRs
+ {&PartMappings[9], 1},
{nullptr, 0},
{nullptr, 0},
{nullptr, 0},
- {&PartMappings[8], 1},
- {&PartMappings[9], 1},
{&PartMappings[10], 1},
{&PartMappings[11], 1},
{&PartMappings[12], 1},
{&PartMappings[13], 1},
{&PartMappings[14], 1},
- {&PartMappings[15], 1}
+ {&PartMappings[15], 1},
+ {&PartMappings[16], 1},
+ {&PartMappings[17], 1}
};
enum ValueMappingIdx {
- SGPRStartIdx = 0,
- VGPRStartIdx = 10
+ SCCStartIdx = 0,
+ SGPRStartIdx = 2,
+ VGPRStartIdx = 12
};
const RegisterBankInfo::ValueMapping *getValueMapping(unsigned BankID,
@@ -89,16 +103,28 @@ const RegisterBankInfo::ValueMapping *getValueMapping(unsigned BankID,
unsigned Idx;
switch (Size) {
case 1:
- Idx = BankID == AMDGPU::SCCRegBankID ? PM_SGPR1 : PM_VGPR1;
+ if (BankID == AMDGPU::SCCRegBankID)
+ return &ValMappings[0];
+ if (BankID == AMDGPU::VCCRegBankID)
+ return &ValMappings[1];
+
+ // 1-bit values not from a compare etc.
+ Idx = BankID == AMDGPU::SGPRRegBankID ? PM_SGPR1 : PM_VGPR1;
break;
case 96:
+ assert(BankID != AMDGPU::VCCRegBankID);
Idx = BankID == AMDGPU::SGPRRegBankID ? PM_SGPR96 : PM_VGPR96;
break;
default:
+ assert(BankID != AMDGPU::VCCRegBankID);
Idx = BankID == AMDGPU::VGPRRegBankID ? VGPRStartIdx : SGPRStartIdx;
Idx += Log2_32_Ceil(Size);
break;
}
+
+ assert(Log2_32_Ceil(Size) == Log2_32_Ceil(ValMappings[Idx].BreakDown->Length));
+ assert(BankID == ValMappings[Idx].BreakDown->RegBank->getID());
+
return &ValMappings[Idx];
}
diff --git a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index 01ef346f74ee8..c38b0e61558b3 100644
--- a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -16,6 +16,7 @@
#include "AMDGPUHSAMetadataStreamer.h"
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
+#include "MCTargetDesc/AMDGPUTargetStreamer.h"
#include "SIMachineFunctionInfo.h"
#include "SIProgramInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
@@ -36,11 +37,14 @@ static cl::opt<bool> VerifyHSAMetadata(
namespace AMDGPU {
namespace HSAMD {
-void MetadataStreamer::dump(StringRef HSAMetadataString) const {
+//===----------------------------------------------------------------------===//
+// HSAMetadataStreamerV2
+//===----------------------------------------------------------------------===//
+void MetadataStreamerV2::dump(StringRef HSAMetadataString) const {
errs() << "AMDGPU HSA Metadata:\n" << HSAMetadataString << '\n';
}
-void MetadataStreamer::verify(StringRef HSAMetadataString) const {
+void MetadataStreamerV2::verify(StringRef HSAMetadataString) const {
errs() << "AMDGPU HSA Metadata Parser Test: ";
HSAMD::Metadata FromHSAMetadataString;
@@ -63,7 +67,8 @@ void MetadataStreamer::verify(StringRef HSAMetadataString) const {
}
}
-AccessQualifier MetadataStreamer::getAccessQualifier(StringRef AccQual) const {
+AccessQualifier
+MetadataStreamerV2::getAccessQualifier(StringRef AccQual) const {
if (AccQual.empty())
return AccessQualifier::Unknown;
@@ -74,26 +79,29 @@ AccessQualifier MetadataStreamer::getAccessQualifier(StringRef AccQual) const {
.Default(AccessQualifier::Default);
}
-AddressSpaceQualifier MetadataStreamer::getAddressSpaceQualifer(
+AddressSpaceQualifier
+MetadataStreamerV2::getAddressSpaceQualifier(
unsigned AddressSpace) const {
- if (AddressSpace == AMDGPUASI.PRIVATE_ADDRESS)
+ switch (AddressSpace) {
+ case AMDGPUAS::PRIVATE_ADDRESS:
return AddressSpaceQualifier::Private;
- if (AddressSpace == AMDGPUASI.GLOBAL_ADDRESS)
+ case AMDGPUAS::GLOBAL_ADDRESS:
return AddressSpaceQualifier::Global;
- if (AddressSpace == AMDGPUASI.CONSTANT_ADDRESS)
+ case AMDGPUAS::CONSTANT_ADDRESS:
return AddressSpaceQualifier::Constant;
- if (AddressSpace == AMDGPUASI.LOCAL_ADDRESS)
+ case AMDGPUAS::LOCAL_ADDRESS:
return AddressSpaceQualifier::Local;
- if (AddressSpace == AMDGPUASI.FLAT_ADDRESS)
+ case AMDGPUAS::FLAT_ADDRESS:
return AddressSpaceQualifier::Generic;
- if (AddressSpace == AMDGPUASI.REGION_ADDRESS)
+ case AMDGPUAS::REGION_ADDRESS:
return AddressSpaceQualifier::Region;
-
- llvm_unreachable("Unknown address space qualifier");
+ default:
+ return AddressSpaceQualifier::Unknown;
+ }
}
-ValueKind MetadataStreamer::getValueKind(Type *Ty, StringRef TypeQual,
- StringRef BaseTypeName) const {
+ValueKind MetadataStreamerV2::getValueKind(Type *Ty, StringRef TypeQual,
+ StringRef BaseTypeName) const {
if (TypeQual.find("pipe") != StringRef::npos)
return ValueKind::Pipe;
@@ -114,13 +122,13 @@ ValueKind MetadataStreamer::getValueKind(Type *Ty, StringRef TypeQual,
.Case("queue_t", ValueKind::Queue)
.Default(isa<PointerType>(Ty) ?
(Ty->getPointerAddressSpace() ==
- AMDGPUASI.LOCAL_ADDRESS ?
+ AMDGPUAS::LOCAL_ADDRESS ?
ValueKind::DynamicSharedPointer :
ValueKind::GlobalBuffer) :
ValueKind::ByValue);
}
-ValueType MetadataStreamer::getValueType(Type *Ty, StringRef TypeName) const {
+ValueType MetadataStreamerV2::getValueType(Type *Ty, StringRef TypeName) const {
switch (Ty->getTypeID()) {
case Type::IntegerTyID: {
auto Signed = !TypeName.startswith("u");
@@ -152,7 +160,7 @@ ValueType MetadataStreamer::getValueType(Type *Ty, StringRef TypeName) const {
}
}
-std::string MetadataStreamer::getTypeName(Type *Ty, bool Signed) const {
+std::string MetadataStreamerV2::getTypeName(Type *Ty, bool Signed) const {
switch (Ty->getTypeID()) {
case Type::IntegerTyID: {
if (!Signed)
@@ -189,8 +197,8 @@ std::string MetadataStreamer::getTypeName(Type *Ty, bool Signed) const {
}
}
-std::vector<uint32_t> MetadataStreamer::getWorkGroupDimensions(
- MDNode *Node) const {
+std::vector<uint32_t>
+MetadataStreamerV2::getWorkGroupDimensions(MDNode *Node) const {
std::vector<uint32_t> Dims;
if (Node->getNumOperands() != 3)
return Dims;
@@ -200,9 +208,9 @@ std::vector<uint32_t> MetadataStreamer::getWorkGroupDimensions(
return Dims;
}
-Kernel::CodeProps::Metadata MetadataStreamer::getHSACodeProps(
- const MachineFunction &MF,
- const SIProgramInfo &ProgramInfo) const {
+Kernel::CodeProps::Metadata
+MetadataStreamerV2::getHSACodeProps(const MachineFunction &MF,
+ const SIProgramInfo &ProgramInfo) const {
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
HSAMD::Kernel::CodeProps::Metadata HSACodeProps;
@@ -229,9 +237,9 @@ Kernel::CodeProps::Metadata MetadataStreamer::getHSACodeProps(
return HSACodeProps;
}
-Kernel::DebugProps::Metadata MetadataStreamer::getHSADebugProps(
- const MachineFunction &MF,
- const SIProgramInfo &ProgramInfo) const {
+Kernel::DebugProps::Metadata
+MetadataStreamerV2::getHSADebugProps(const MachineFunction &MF,
+ const SIProgramInfo &ProgramInfo) const {
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
HSAMD::Kernel::DebugProps::Metadata HSADebugProps;
@@ -251,14 +259,14 @@ Kernel::DebugProps::Metadata MetadataStreamer::getHSADebugProps(
return HSADebugProps;
}
-void MetadataStreamer::emitVersion() {
+void MetadataStreamerV2::emitVersion() {
auto &Version = HSAMetadata.mVersion;
Version.push_back(VersionMajor);
Version.push_back(VersionMinor);
}
-void MetadataStreamer::emitPrintf(const Module &Mod) {
+void MetadataStreamerV2::emitPrintf(const Module &Mod) {
auto &Printf = HSAMetadata.mPrintf;
auto Node = Mod.getNamedMetadata("llvm.printf.fmts");
@@ -270,7 +278,7 @@ void MetadataStreamer::emitPrintf(const Module &Mod) {
Printf.push_back(cast<MDString>(Op->getOperand(0))->getString());
}
-void MetadataStreamer::emitKernelLanguage(const Function &Func) {
+void MetadataStreamerV2::emitKernelLanguage(const Function &Func) {
auto &Kernel = HSAMetadata.mKernels.back();
// TODO: What about other languages?
@@ -288,7 +296,7 @@ void MetadataStreamer::emitKernelLanguage(const Function &Func) {
mdconst::extract<ConstantInt>(Op0->getOperand(1))->getZExtValue());
}
-void MetadataStreamer::emitKernelAttrs(const Function &Func) {
+void MetadataStreamerV2::emitKernelAttrs(const Function &Func) {
auto &Attrs = HSAMetadata.mKernels.back().mAttrs;
if (auto Node = Func.getMetadata("reqd_work_group_size"))
@@ -306,14 +314,14 @@ void MetadataStreamer::emitKernelAttrs(const Function &Func) {
}
}
-void MetadataStreamer::emitKernelArgs(const Function &Func) {
+void MetadataStreamerV2::emitKernelArgs(const Function &Func) {
for (auto &Arg : Func.args())
emitKernelArg(Arg);
emitHiddenKernelArgs(Func);
}
-void MetadataStreamer::emitKernelArg(const Argument &Arg) {
+void MetadataStreamerV2::emitKernelArg(const Argument &Arg) {
auto Func = Arg.getParent();
auto ArgNo = Arg.getArgNo();
const MDNode *Node;
@@ -355,7 +363,7 @@ void MetadataStreamer::emitKernelArg(const Argument &Arg) {
unsigned PointeeAlign = 0;
if (auto PtrTy = dyn_cast<PointerType>(Ty)) {
- if (PtrTy->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS) {
+ if (PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
PointeeAlign = Arg.getParamAlignment();
if (PointeeAlign == 0)
PointeeAlign = DL.getABITypeAlignment(PtrTy->getElementType());
@@ -366,12 +374,12 @@ void MetadataStreamer::emitKernelArg(const Argument &Arg) {
PointeeAlign, Name, TypeName, BaseTypeName, AccQual, TypeQual);
}
-void MetadataStreamer::emitKernelArg(const DataLayout &DL, Type *Ty,
- ValueKind ValueKind,
- unsigned PointeeAlign,
- StringRef Name,
- StringRef TypeName, StringRef BaseTypeName,
- StringRef AccQual, StringRef TypeQual) {
+void MetadataStreamerV2::emitKernelArg(const DataLayout &DL, Type *Ty,
+ ValueKind ValueKind,
+ unsigned PointeeAlign, StringRef Name,
+ StringRef TypeName,
+ StringRef BaseTypeName,
+ StringRef AccQual, StringRef TypeQual) {
HSAMetadata.mKernels.back().mArgs.push_back(Kernel::Arg::Metadata());
auto &Arg = HSAMetadata.mKernels.back().mArgs.back();
@@ -384,7 +392,7 @@ void MetadataStreamer::emitKernelArg(const DataLayout &DL, Type *Ty,
Arg.mPointeeAlign = PointeeAlign;
if (auto PtrTy = dyn_cast<PointerType>(Ty))
- Arg.mAddrSpaceQual = getAddressSpaceQualifer(PtrTy->getAddressSpace());
+ Arg.mAddrSpaceQual = getAddressSpaceQualifier(PtrTy->getAddressSpace());
Arg.mAccQual = getAccessQualifier(AccQual);
@@ -404,7 +412,7 @@ void MetadataStreamer::emitKernelArg(const DataLayout &DL, Type *Ty,
}
}
-void MetadataStreamer::emitHiddenKernelArgs(const Function &Func) {
+void MetadataStreamerV2::emitHiddenKernelArgs(const Function &Func) {
int HiddenArgNumBytes =
getIntegerAttribute(Func, "amdgpu-implicitarg-num-bytes", 0);
@@ -422,7 +430,7 @@ void MetadataStreamer::emitHiddenKernelArgs(const Function &Func) {
emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetZ);
auto Int8PtrTy = Type::getInt8PtrTy(Func.getContext(),
- AMDGPUASI.GLOBAL_ADDRESS);
+ AMDGPUAS::GLOBAL_ADDRESS);
// Emit "printf buffer" argument if printf is used, otherwise emit dummy
// "none" argument.
@@ -446,13 +454,16 @@ void MetadataStreamer::emitHiddenKernelArgs(const Function &Func) {
}
}
-void MetadataStreamer::begin(const Module &Mod) {
- AMDGPUASI = getAMDGPUAS(Mod);
+bool MetadataStreamerV2::emitTo(AMDGPUTargetStreamer &TargetStreamer) {
+ return TargetStreamer.EmitHSAMetadata(getHSAMetadata());
+}
+
+void MetadataStreamerV2::begin(const Module &Mod) {
emitVersion();
emitPrintf(Mod);
}
-void MetadataStreamer::end() {
+void MetadataStreamerV2::end() {
std::string HSAMetadataString;
if (toString(HSAMetadata, HSAMetadataString))
return;
@@ -463,7 +474,8 @@ void MetadataStreamer::end() {
verify(HSAMetadataString);
}
-void MetadataStreamer::emitKernel(const MachineFunction &MF, const SIProgramInfo &ProgramInfo) {
+void MetadataStreamerV2::emitKernel(const MachineFunction &MF,
+ const SIProgramInfo &ProgramInfo) {
auto &Func = MF.getFunction();
if (Func.getCallingConv() != CallingConv::AMDGPU_KERNEL)
return;
@@ -483,6 +495,505 @@ void MetadataStreamer::emitKernel(const MachineFunction &MF, const SIProgramInfo
HSAMetadata.mKernels.back().mDebugProps = DebugProps;
}
+//===----------------------------------------------------------------------===//
+// HSAMetadataStreamerV3
+//===----------------------------------------------------------------------===//
+
+void MetadataStreamerV3::dump(StringRef HSAMetadataString) const {
+ errs() << "AMDGPU HSA Metadata:\n" << HSAMetadataString << '\n';
+}
+
+/// Round-trip self check. Parses \p HSAMetadataString as YAML into a msgpack
+/// map node, writes that node back out as YAML, and prints PASS or FAIL on
+/// errs() depending on whether the regenerated text equals the input. On a
+/// mismatch both texts are printed as well.
+void MetadataStreamerV3::verify(StringRef HSAMetadataString) const {
+  raw_ostream &OS = errs();
+  OS << "AMDGPU HSA Metadata Parser Test: ";
+
+  std::shared_ptr<msgpack::Node> Parsed = std::make_shared<msgpack::MapNode>();
+
+  yaml::Input Reader(HSAMetadataString);
+  Reader >> Parsed;
+  if (Reader.error()) {
+    OS << "FAIL\n";
+    return;
+  }
+
+  std::string RoundTripped;
+  raw_string_ostream Buffer(RoundTripped);
+  yaml::Output Writer(Buffer);
+  Writer << Parsed;
+
+  // str() flushes the stream into RoundTripped, so the string is complete
+  // from this point on.
+  const bool Matches = HSAMetadataString == Buffer.str();
+  OS << (Matches ? "PASS" : "FAIL") << '\n';
+  if (!Matches) {
+    OS << "Original input: " << HSAMetadataString << '\n'
+       << "Produced output: " << RoundTripped << '\n';
+  }
+}
+
+/// Map an access qualifier spelling to its metadata string. Only the exact
+/// spellings "read_only", "write_only" and "read_write" are recognized; any
+/// other input (including the empty string) yields None.
+Optional<StringRef>
+MetadataStreamerV3::getAccessQualifier(StringRef AccQual) const {
+  if (AccQual == "read_only")
+    return StringRef("read_only");
+  if (AccQual == "write_only")
+    return StringRef("write_only");
+  if (AccQual == "read_write")
+    return StringRef("read_write");
+  return None;
+}
+
+/// Translate an AMDGPUAS address space number into its metadata qualifier
+/// string. Address spaces not listed below produce None.
+Optional<StringRef>
+MetadataStreamerV3::getAddressSpaceQualifier(unsigned AddressSpace) const {
+  Optional<StringRef> Qualifier; // Remains empty for unlisted address spaces.
+  switch (AddressSpace) {
+  case AMDGPUAS::PRIVATE_ADDRESS:
+    Qualifier = StringRef("private");
+    break;
+  case AMDGPUAS::GLOBAL_ADDRESS:
+    Qualifier = StringRef("global");
+    break;
+  case AMDGPUAS::CONSTANT_ADDRESS:
+    Qualifier = StringRef("constant");
+    break;
+  case AMDGPUAS::LOCAL_ADDRESS:
+    Qualifier = StringRef("local");
+    break;
+  case AMDGPUAS::FLAT_ADDRESS:
+    Qualifier = StringRef("generic");
+    break;
+  case AMDGPUAS::REGION_ADDRESS:
+    Qualifier = StringRef("region");
+    break;
+  default:
+    break;
+  }
+  return Qualifier;
+}
+
+/// Classify a kernel argument into a value-kind string.
+///
+/// A type qualifier containing "pipe" wins outright. Otherwise the base type
+/// name selects "image", "sampler" or "queue"; names outside that table fall
+/// back on the IR type: "dynamic_shared_pointer" for pointers into the local
+/// address space, "global_buffer" for any other pointer, and "by_value" for
+/// non-pointers.
+StringRef MetadataStreamerV3::getValueKind(Type *Ty, StringRef TypeQual,
+                                           StringRef BaseTypeName) const {
+  const bool IsPipe = TypeQual.find("pipe") != StringRef::npos;
+  if (IsPipe)
+    return "pipe";
+
+  // Kind used when the base type name is not one of the known opaque types.
+  StringRef Fallback = "by_value";
+  if (isa<PointerType>(Ty)) {
+    const bool IsLocal =
+        Ty->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+    Fallback = IsLocal ? "dynamic_shared_pointer" : "global_buffer";
+  }
+
+  return StringSwitch<StringRef>(BaseTypeName)
+      .Case("image1d_t", "image")
+      .Case("image1d_array_t", "image")
+      .Case("image1d_buffer_t", "image")
+      .Case("image2d_t", "image")
+      .Case("image2d_array_t", "image")
+      .Case("image2d_array_depth_t", "image")
+      .Case("image2d_array_msaa_t", "image")
+      .Case("image2d_array_msaa_depth_t", "image")
+      .Case("image2d_depth_t", "image")
+      .Case("image2d_msaa_t", "image")
+      .Case("image2d_msaa_depth_t", "image")
+      .Case("image3d_t", "image")
+      .Case("sampler_t", "sampler")
+      .Case("queue_t", "queue")
+      .Default(Fallback);
+}
+
+/// Produce the value-type string for \p Ty.
+///
+/// Pointer and vector types are described by their element type. Half, float
+/// and double map to "f16", "f32" and "f64". Integers of width 8/16/32/64 map
+/// to "i<N>", or to "u<N>" when \p TypeName starts with 'u'. Everything else,
+/// including integers of other widths, is reported as "struct".
+StringRef MetadataStreamerV3::getValueType(Type *Ty, StringRef TypeName) const {
+  // Peel pointer/vector wrappers until a non-wrapper type is reached.
+  for (;;) {
+    const Type::TypeID Wrapper = Ty->getTypeID();
+    if (Wrapper == Type::PointerTyID)
+      Ty = Ty->getPointerElementType();
+    else if (Wrapper == Type::VectorTyID)
+      Ty = Ty->getVectorElementType();
+    else
+      break;
+  }
+
+  switch (Ty->getTypeID()) {
+  case Type::HalfTyID:
+    return "f16";
+  case Type::FloatTyID:
+    return "f32";
+  case Type::DoubleTyID:
+    return "f64";
+  case Type::IntegerTyID:
+    break; // Handled below.
+  default:
+    return "struct";
+  }
+
+  // Signedness is inferred from the source-level type name only.
+  const bool IsSigned = !TypeName.startswith("u");
+  switch (Ty->getIntegerBitWidth()) {
+  case 8:
+    return IsSigned ? "i8" : "u8";
+  case 16:
+    return IsSigned ? "i16" : "u16";
+  case 32:
+    return IsSigned ? "i32" : "u32";
+  case 64:
+    return IsSigned ? "i64" : "u64";
+  default:
+    return "struct";
+  }
+}
+
+/// Build a C-like type name for \p Ty.
+///
+/// Integers become "char"/"short"/"int"/"long" for widths 8/16/32/64 and
+/// "i<width>" otherwise, with a 'u' prefix when \p Signed is false. Floating
+/// point types become "half"/"float"/"double". A vector is its element's name
+/// followed by the element count. Any other type is "unknown".
+std::string MetadataStreamerV3::getTypeName(Type *Ty, bool Signed) const {
+  const Type::TypeID Kind = Ty->getTypeID();
+  if (Kind == Type::HalfTyID)
+    return "half";
+  if (Kind == Type::FloatTyID)
+    return "float";
+  if (Kind == Type::DoubleTyID)
+    return "double";
+  if (Kind == Type::VectorTyID) {
+    auto *Vec = cast<VectorType>(Ty);
+    const std::string ElemName = getTypeName(Vec->getElementType(), Signed);
+    return (Twine(ElemName) + Twine(Vec->getVectorNumElements())).str();
+  }
+  if (Kind != Type::IntegerTyID)
+    return "unknown";
+
+  // Integer: work out the signed spelling first, then prefix if unsigned.
+  const auto BitWidth = Ty->getIntegerBitWidth();
+  std::string SignedName;
+  switch (BitWidth) {
+  case 8:
+    SignedName = "char";
+    break;
+  case 16:
+    SignedName = "short";
+    break;
+  case 32:
+    SignedName = "int";
+    break;
+  case 64:
+    SignedName = "long";
+    break;
+  default:
+    SignedName = (Twine('i') + Twine(BitWidth)).str();
+    break;
+  }
+  return Signed ? SignedName : (Twine('u') + SignedName).str();
+}
+
+/// Convert a three-operand metadata node into an array of scalar nodes holding
+/// each operand's zero-extended integer value. A node with any other operand
+/// count yields an empty array.
+std::shared_ptr<msgpack::ArrayNode>
+MetadataStreamerV3::getWorkGroupDimensions(MDNode *Node) const {
+  auto Result = std::make_shared<msgpack::ArrayNode>();
+  if (Node->getNumOperands() == 3) {
+    for (auto &Operand : Node->operands()) {
+      auto *Extent = mdconst::extract<ConstantInt>(Operand);
+      Result->push_back(
+          std::make_shared<msgpack::ScalarNode>(Extent->getZExtValue()));
+    }
+  }
+  return Result;
+}
+
+/// Store the two-element [major, minor] V3 metadata version array under the
+/// root key "amdhsa.version".
+void MetadataStreamerV3::emitVersion() {
+  auto Major = std::make_shared<msgpack::ScalarNode>(V3::VersionMajor);
+  auto Minor = std::make_shared<msgpack::ScalarNode>(V3::VersionMinor);
+
+  auto VersionPair = std::make_shared<msgpack::ArrayNode>();
+  VersionPair->push_back(std::move(Major));
+  VersionPair->push_back(std::move(Minor));
+  getRootMetadata("amdhsa.version") = std::move(VersionPair);
+}
+
+/// Collect the strings listed in the module's "llvm.printf.fmts" named
+/// metadata and store them under the root key "amdhsa.printf". Nothing is
+/// emitted when the module has no such metadata; entries without operands are
+/// skipped.
+void MetadataStreamerV3::emitPrintf(const Module &Mod) {
+  auto Formats = Mod.getNamedMetadata("llvm.printf.fmts");
+  if (!Formats)
+    return;
+
+  auto List = std::make_shared<msgpack::ArrayNode>();
+  for (auto Entry : Formats->operands()) {
+    if (!Entry->getNumOperands())
+      continue;
+    // The string itself is the first operand of each entry.
+    StringRef Fmt = cast<MDString>(Entry->getOperand(0))->getString();
+    List->push_back(std::make_shared<msgpack::ScalarNode>(Fmt));
+  }
+  getRootMetadata("amdhsa.printf") = std::move(List);
+}
+
+/// Record the kernel's source language in \p Kern.
+///
+/// Reads the first operand of the module's "opencl.ocl.version" named
+/// metadata. When it is present and has at least two operands, sets
+/// ".language" to "OpenCL C" and ".language_version" to the array of those two
+/// integers. Otherwise \p Kern is left untouched.
+void MetadataStreamerV3::emitKernelLanguage(const Function &Func,
+                                            msgpack::MapNode &Kern) {
+  // TODO: What about other languages?
+  auto OclVersion = Func.getParent()->getNamedMetadata("opencl.ocl.version");
+  const bool HasVersionNode = OclVersion && OclVersion->getNumOperands();
+  if (!HasVersionNode)
+    return;
+
+  auto VersionTuple = OclVersion->getOperand(0);
+  if (VersionTuple->getNumOperands() < 2)
+    return;
+
+  Kern[".language"] = std::make_shared<msgpack::ScalarNode>("OpenCL C");
+
+  auto VersionArray = std::make_shared<msgpack::ArrayNode>();
+  const auto First =
+      mdconst::extract<ConstantInt>(VersionTuple->getOperand(0))->getZExtValue();
+  VersionArray->push_back(std::make_shared<msgpack::ScalarNode>(First));
+  const auto Second =
+      mdconst::extract<ConstantInt>(VersionTuple->getOperand(1))->getZExtValue();
+  VersionArray->push_back(std::make_shared<msgpack::ScalarNode>(Second));
+  Kern[".language_version"] = std::move(VersionArray);
+}
+
+/// Copy optional kernel attributes from \p Func into \p Kern.
+///
+/// Each entry is written only when its source is present on the function:
+///  - "reqd_work_group_size" metadata  -> ".reqd_workgroup_size"
+///  - "work_group_size_hint" metadata  -> ".workgroup_size_hint"
+///  - "vec_type_hint" metadata         -> ".vec_type_hint" (type name string)
+///  - "runtime-handle" fn attribute    -> ".device_enqueue_symbol"
+void MetadataStreamerV3::emitKernelAttrs(const Function &Func,
+                                         msgpack::MapNode &Kern) {
+  if (auto ReqdSize = Func.getMetadata("reqd_work_group_size"))
+    Kern[".reqd_workgroup_size"] = getWorkGroupDimensions(ReqdSize);
+
+  if (auto SizeHint = Func.getMetadata("work_group_size_hint"))
+    Kern[".workgroup_size_hint"] = getWorkGroupDimensions(SizeHint);
+
+  if (auto VecHint = Func.getMetadata("vec_type_hint")) {
+    // Operand 0 carries the hinted type, operand 1 its signedness flag.
+    Type *HintTy = cast<ValueAsMetadata>(VecHint->getOperand(0))->getType();
+    const bool IsSigned =
+        mdconst::extract<ConstantInt>(VecHint->getOperand(1))->getZExtValue();
+    Kern[".vec_type_hint"] =
+        std::make_shared<msgpack::ScalarNode>(getTypeName(HintTy, IsSigned));
+  }
+
+  if (Func.hasFnAttribute("runtime-handle")) {
+    const Attribute Handle = Func.getFnAttribute("runtime-handle");
+    Kern[".device_enqueue_symbol"] = std::make_shared<msgpack::ScalarNode>(
+        Handle.getValueAsString().str());
+  }
+}
+
+/// Builds the ".args" array for the kernel \p Func and stores it in \p Kern.
+///
+/// The array holds one entry per explicit function argument, in declaration
+/// order, followed by whatever hidden arguments emitHiddenKernelArgs()
+/// decides to add.
+void MetadataStreamerV3::emitKernelArgs(const Function &Func,
+                                        msgpack::MapNode &Kern) {
+  // Running byte offset shared by all entries; each emitKernelArg() call
+  // aligns it for the argument being emitted and then advances past it.
+  unsigned Offset = 0;
+  auto Args = std::make_shared<msgpack::ArrayNode>();
+
+  for (auto &Arg : Func.args())
+    emitKernelArg(Arg, Offset, *Args);
+
+  // emitHiddenKernelArgs() already emits the hidden global offsets, printf
+  // buffer, default queue and completion action entries (bounded by the
+  // "amdgpu-implicitarg-num-bytes" attribute). They must not be emitted a
+  // second time here: doing so would duplicate the entries and advance the
+  // offsets twice.
+  emitHiddenKernelArgs(Func, Offset, *Args);
+
+  Kern[".args"] = std::move(Args);
+}
+
+void MetadataStreamerV3::emitKernelArg(const Argument &Arg, unsigned &Offset,
+                                       msgpack::ArrayNode &Args) {
+  auto Func = Arg.getParent();
+  auto ArgNo = Arg.getArgNo();
+
+  // Reads this argument's string from the function metadata named MDName.
+  // Returns false, leaving Out untouched, when the metadata is missing or has
+  // no operand at this argument's index.
+  auto readArgMD = [&](const char *MDName, StringRef &Out) -> bool {
+    const MDNode *Node = Func->getMetadata(MDName);
+    if (!Node || ArgNo >= Node->getNumOperands())
+      return false;
+    Out = cast<MDString>(Node->getOperand(ArgNo))->getString();
+    return true;
+  };
+
+  // The metadata name wins; the IR argument name is only a fallback.
+  StringRef Name;
+  if (!readArgMD("kernel_arg_name", Name) && Arg.hasName())
+    Name = Arg.getName();
+
+  StringRef TypeName;
+  readArgMD("kernel_arg_type", TypeName);
+
+  StringRef BaseTypeName;
+  readArgMD("kernel_arg_base_type", BaseTypeName);
+
+  // A pointer argument that only reads memory and is noalias is reported as
+  // "read_only" without consulting the access qualifier metadata.
+  StringRef AccQual;
+  if (Arg.getType()->isPointerTy() && Arg.onlyReadsMemory() &&
+      Arg.hasNoAliasAttr())
+    AccQual = "read_only";
+  else
+    readArgMD("kernel_arg_access_qual", AccQual);
+
+  StringRef TypeQual;
+  readArgMD("kernel_arg_type_qual", TypeQual);
+
+  Type *Ty = Arg.getType();
+  const DataLayout &DL = Func->getParent()->getDataLayout();
+
+  // Pointee alignment is only computed for pointers into the local address
+  // space: the parameter alignment if set, else the pointee's ABI alignment.
+  unsigned PointeeAlign = 0;
+  auto PtrTy = dyn_cast<PointerType>(Ty);
+  if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+    PointeeAlign = Arg.getParamAlignment();
+    if (!PointeeAlign)
+      PointeeAlign = DL.getABITypeAlignment(PtrTy->getElementType());
+  }
+
+  emitKernelArg(DL, Ty, getValueKind(Ty, TypeQual, BaseTypeName), Offset, Args,
+                PointeeAlign, Name, TypeName, BaseTypeName, AccQual, TypeQual);
+}
+
+void MetadataStreamerV3::emitKernelArg(const DataLayout &DL, Type *Ty,
+                                       StringRef ValueKind, unsigned &Offset,
+                                       msgpack::ArrayNode &Args,
+                                       unsigned PointeeAlign, StringRef Name,
+                                       StringRef TypeName,
+                                       StringRef BaseTypeName,
+                                       StringRef AccQual, StringRef TypeQual) {
+  // Build one map node describing the argument and append it to Args.
+  auto ArgNode = std::make_shared<msgpack::MapNode>();
+  msgpack::MapNode &Entry = *ArgNode;
+
+  // Optional textual fields are written only when non-empty.
+  if (!Name.empty())
+    Entry[".name"] = std::make_shared<msgpack::ScalarNode>(Name);
+  if (!TypeName.empty())
+    Entry[".type_name"] = std::make_shared<msgpack::ScalarNode>(TypeName);
+
+  // Round the running offset up to the type's ABI alignment, record it, and
+  // then step past the argument's allocation size.
+  auto AllocSize = DL.getTypeAllocSize(Ty);
+  auto ABIAlign = DL.getABITypeAlignment(Ty);
+  Entry[".size"] = std::make_shared<msgpack::ScalarNode>(AllocSize);
+  Offset = alignTo(Offset, ABIAlign);
+  Entry[".offset"] = std::make_shared<msgpack::ScalarNode>(Offset);
+  Offset += AllocSize;
+
+  Entry[".value_kind"] = std::make_shared<msgpack::ScalarNode>(ValueKind);
+  Entry[".value_type"] =
+      std::make_shared<msgpack::ScalarNode>(getValueType(Ty, BaseTypeName));
+  if (PointeeAlign)
+    Entry[".pointee_align"] =
+        std::make_shared<msgpack::ScalarNode>(PointeeAlign);
+
+  // The address space is only recorded for pointer types, and only when
+  // getAddressSpaceQualifier() yields a value.
+  if (auto PtrTy = dyn_cast<PointerType>(Ty)) {
+    auto Qualifier = getAddressSpaceQualifier(PtrTy->getAddressSpace());
+    if (Qualifier)
+      Entry[".address_space"] =
+          std::make_shared<msgpack::ScalarNode>(*Qualifier);
+  }
+
+  if (auto Access = getAccessQualifier(AccQual))
+    Entry[".access"] = std::make_shared<msgpack::ScalarNode>(*Access);
+
+  // TODO: Emit Arg[".actual_access"].
+
+  // TypeQual is a space-separated list; each recognized qualifier sets the
+  // matching boolean flag, and unrecognized ones are ignored.
+  SmallVector<StringRef, 1> Qualifiers;
+  TypeQual.split(Qualifiers, " ", -1, false);
+  for (StringRef Qual : Qualifiers) {
+    const char *FlagKey = nullptr;
+    if (Qual == "const")
+      FlagKey = ".is_const";
+    else if (Qual == "restrict")
+      FlagKey = ".is_restrict";
+    else if (Qual == "volatile")
+      FlagKey = ".is_volatile";
+    else if (Qual == "pipe")
+      FlagKey = ".is_pipe";
+
+    if (FlagKey)
+      Entry[FlagKey] = std::make_shared<msgpack::ScalarNode>(true);
+  }
+
+  Args.push_back(std::move(ArgNode));
+}
+
+void MetadataStreamerV3::emitHiddenKernelArgs(const Function &Func,
+                                              unsigned &Offset,
+                                              msgpack::ArrayNode &Args) {
+  // The "amdgpu-implicitarg-num-bytes" attribute (default 0) bounds which
+  // hidden arguments are emitted; zero means none at all.
+  int NumBytes = getIntegerAttribute(Func, "amdgpu-implicitarg-num-bytes", 0);
+  if (NumBytes == 0)
+    return;
+
+  auto &DL = Func.getParent()->getDataLayout();
+  auto emitHidden = [&](Type *Ty, StringRef Kind) {
+    emitKernelArg(DL, Ty, Kind, Offset, Args);
+  };
+
+  // Three 64-bit global offsets, each emitted only if the byte budget reaches
+  // the end of that slot (8, 16 and 24 bytes).
+  auto Int64Ty = Type::getInt64Ty(Func.getContext());
+  static const char *const GlobalOffsetKinds[] = {"hidden_global_offset_x",
+                                                  "hidden_global_offset_y",
+                                                  "hidden_global_offset_z"};
+  for (int Dim = 0; Dim != 3; ++Dim)
+    if (NumBytes >= 8 * (Dim + 1))
+      emitHidden(Int64Ty, GlobalOffsetKinds[Dim]);
+
+  auto Int8PtrTy =
+      Type::getInt8PtrTy(Func.getContext(), AMDGPUAS::GLOBAL_ADDRESS);
+
+  // Emit "printf buffer" argument if printf is used, otherwise emit dummy
+  // "none" argument.
+  if (NumBytes >= 32) {
+    bool UsesPrintf =
+        Func.getParent()->getNamedMetadata("llvm.printf.fmts") != nullptr;
+    emitHidden(Int8PtrTy, UsesPrintf ? "hidden_printf_buffer" : "hidden_none");
+  }
+
+  // Emit "default queue" and "completion action" arguments if enqueue kernel is
+  // used, otherwise emit dummy "none" arguments.
+  if (NumBytes >= 48) {
+    bool CallsEnqueue = Func.hasFnAttribute("calls-enqueue-kernel");
+    emitHidden(Int8PtrTy,
+               CallsEnqueue ? "hidden_default_queue" : "hidden_none");
+    emitHidden(Int8PtrTy,
+               CallsEnqueue ? "hidden_completion_action" : "hidden_none");
+  }
+}
+
+std::shared_ptr<msgpack::MapNode>
+MetadataStreamerV3::getHSAKernelProps(const MachineFunction &MF,
+                                      const SIProgramInfo &ProgramInfo) const {
+  const Function &F = MF.getFunction();
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIMachineFunctionInfo &FuncInfo = *MF.getInfo<SIMachineFunctionInfo>();
+
+  // MaxKernArgAlign is passed in uninitialized and read afterwards, so it is
+  // presumably filled in by getKernArgSegmentSize() - confirm against the
+  // subtarget. The reported alignment is never smaller than 4.
+  unsigned MaxKernArgAlign;
+  auto KernArgSize = ST.getKernArgSegmentSize(F, MaxKernArgAlign);
+  auto KernArgAlign = std::max(uint32_t(4), MaxKernArgAlign);
+
+  auto Props = std::make_shared<msgpack::MapNode>();
+  msgpack::MapNode &Kern = *Props;
+
+  // Segment sizes and alignment.
+  Kern[".kernarg_segment_size"] =
+      std::make_shared<msgpack::ScalarNode>(KernArgSize);
+  Kern[".kernarg_segment_align"] =
+      std::make_shared<msgpack::ScalarNode>(KernArgAlign);
+  Kern[".group_segment_fixed_size"] =
+      std::make_shared<msgpack::ScalarNode>(ProgramInfo.LDSSize);
+  Kern[".private_segment_fixed_size"] =
+      std::make_shared<msgpack::ScalarNode>(ProgramInfo.ScratchSize);
+
+  // Wavefront and work-group limits.
+  Kern[".wavefront_size"] =
+      std::make_shared<msgpack::ScalarNode>(ST.getWavefrontSize());
+  Kern[".max_flat_workgroup_size"] =
+      std::make_shared<msgpack::ScalarNode>(FuncInfo.getMaxFlatWorkGroupSize());
+
+  // Register usage and spill counts.
+  Kern[".sgpr_count"] =
+      std::make_shared<msgpack::ScalarNode>(ProgramInfo.NumSGPR);
+  Kern[".vgpr_count"] =
+      std::make_shared<msgpack::ScalarNode>(ProgramInfo.NumVGPR);
+  Kern[".sgpr_spill_count"] =
+      std::make_shared<msgpack::ScalarNode>(FuncInfo.getNumSpilledSGPRs());
+  Kern[".vgpr_spill_count"] =
+      std::make_shared<msgpack::ScalarNode>(FuncInfo.getNumSpilledVGPRs());
+
+  return Props;
+}
+
+bool MetadataStreamerV3::emitTo(AMDGPUTargetStreamer &TargetStreamer) {
+  // Hand the root of the metadata tree to the target streamer and forward its
+  // result unchanged.
+  auto &Root = getHSAMetadataRoot();
+  return TargetStreamer.EmitHSAMetadata(Root, true);
+}
+
+void MetadataStreamerV3::begin(const Module &Mod) {
+  // Write the module-level entries, then install an empty "amdhsa.kernels"
+  // array for emitKernel() to append to.
+  emitVersion();
+  emitPrintf(Mod);
+  getRootMetadata("amdhsa.kernels") = std::make_shared<msgpack::ArrayNode>();
+}
+
+void MetadataStreamerV3::end() {
+  // Render the metadata tree as YAML into a local string. The text is only
+  // consumed by the optional dump and verify steps below.
+  std::string YamlText;
+  raw_string_ostream OS(YamlText);
+  yaml::Output YOut(OS);
+  YOut << HSAMetadataRoot;
+
+  if (DumpHSAMetadata || VerifyHSAMetadata) {
+    StringRef Rendered = OS.str();
+    if (DumpHSAMetadata)
+      dump(Rendered);
+    if (VerifyHSAMetadata)
+      verify(Rendered);
+  }
+}
+
+void MetadataStreamerV3::emitKernel(const MachineFunction &MF,
+                                    const SIProgramInfo &ProgramInfo) {
+  auto &Func = MF.getFunction();
+  auto KernelProps = getHSAKernelProps(MF, ProgramInfo);
+
+  assert(Func.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
+         Func.getCallingConv() == CallingConv::SPIR_KERNEL);
+
+  // The kernel list is the array stored under "amdhsa.kernels" by begin().
+  auto &KernelsNode = getRootMetadata("amdhsa.kernels");
+  auto KernelList = cast<msgpack::ArrayNode>(KernelsNode.get());
+
+  // Start from the code-properties map and add the kernel's name, its symbol
+  // (the function name with a ".kd" suffix), language, attributes and
+  // arguments.
+  msgpack::MapNode &Kern = *KernelProps;
+  Kern[".name"] = std::make_shared<msgpack::ScalarNode>(Func.getName());
+  std::string Symbol = (Twine(Func.getName()) + Twine(".kd")).str();
+  Kern[".symbol"] = std::make_shared<msgpack::ScalarNode>(std::move(Symbol));
+  emitKernelLanguage(Func, Kern);
+  emitKernelAttrs(Func, Kern);
+  emitKernelArgs(Func, Kern);
+
+  KernelList->push_back(std::move(KernelProps));
+}
+
} // end namespace HSAMD
} // end namespace AMDGPU
} // end namespace llvm
diff --git a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
index 3424c956d7816..afc09baf952d6 100644
--- a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
+++ b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
@@ -19,10 +19,12 @@
#include "AMDGPU.h"
#include "AMDKernelCodeT.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/BinaryFormat/MsgPackTypes.h"
#include "llvm/Support/AMDGPUMetadata.h"
namespace llvm {
+class AMDGPUTargetStreamer;
class Argument;
class DataLayout;
class Function;
@@ -34,10 +36,94 @@ class Type;
namespace AMDGPU {
namespace HSAMD {
-class MetadataStreamer final {
+/// Abstract interface implemented by the versioned HSA metadata streamers
+/// declared below. A streamer accumulates metadata between begin() and end(),
+/// receives one emitKernel() call per kernel, and hands the result to a
+/// target streamer through emitTo().
+class MetadataStreamer {
+public:
+  virtual ~MetadataStreamer() = default;
+
+  /// Passes the accumulated metadata to \p TargetStreamer.
+  /// \returns the status reported by the implementation.
+  virtual bool emitTo(AMDGPUTargetStreamer &TargetStreamer) = 0;
+
+  /// Starts metadata emission for \p Mod.
+  virtual void begin(const Module &Mod) = 0;
+
+  /// Finishes metadata emission for the current module.
+  virtual void end() = 0;
+
+  /// Records the metadata of the kernel compiled in \p MF, using the resource
+  /// usage collected in \p ProgramInfo.
+  virtual void emitKernel(const MachineFunction &MF,
+                          const SIProgramInfo &ProgramInfo) = 0;
+};
+
+/// Metadata streamer that builds its output as a tree of msgpack nodes.
+class MetadataStreamerV3 final : public MetadataStreamer {
+private:
+  /// Root of the metadata tree; it starts out as an empty map node.
+  std::shared_ptr<msgpack::Node> HSAMetadataRoot =
+      std::make_shared<msgpack::MapNode>();
+
+  /// Returns the slot stored under \p Key in the root map.
+  std::shared_ptr<msgpack::Node> &getRootMetadata(StringRef Key) {
+    auto *RootMap = cast<msgpack::MapNode>(HSAMetadataRoot.get());
+    return (*RootMap)[Key];
+  }
+
+  /// Returns the root node of the metadata tree.
+  std::shared_ptr<msgpack::Node> &getHSAMetadataRoot() {
+    return HSAMetadataRoot;
+  }
+
+  // Consumers of the textual form of the metadata.
+  void dump(StringRef HSAMetadataString) const;
+
+  void verify(StringRef HSAMetadataString) const;
+
+  // Translators from IR-level information to metadata values.
+  Optional<StringRef> getAccessQualifier(StringRef AccQual) const;
+
+  Optional<StringRef> getAddressSpaceQualifier(unsigned AddressSpace) const;
+
+  StringRef getValueKind(Type *Ty, StringRef TypeQual,
+                         StringRef BaseTypeName) const;
+
+  StringRef getValueType(Type *Ty, StringRef TypeName) const;
+
+  std::string getTypeName(Type *Ty, bool Signed) const;
+
+  std::shared_ptr<msgpack::ArrayNode>
+  getWorkGroupDimensions(MDNode *Node) const;
+
+  std::shared_ptr<msgpack::MapNode>
+  getHSAKernelProps(const MachineFunction &MF,
+                    const SIProgramInfo &ProgramInfo) const;
+
+  // Emitters for module-level entries.
+  void emitVersion();
+
+  void emitPrintf(const Module &Mod);
+
+  // Emitters for per-kernel entries; each writes into the kernel's map node.
+  void emitKernelLanguage(const Function &Func, msgpack::MapNode &Kern);
+
+  void emitKernelAttrs(const Function &Func, msgpack::MapNode &Kern);
+
+  void emitKernelArgs(const Function &Func, msgpack::MapNode &Kern);
+
+  // Emitters for argument entries; each appends to Args and advances Offset.
+  void emitKernelArg(const Argument &Arg, unsigned &Offset,
+                     msgpack::ArrayNode &Args);
+
+  void emitKernelArg(const DataLayout &DL, Type *Ty, StringRef ValueKind,
+                     unsigned &Offset, msgpack::ArrayNode &Args,
+                     unsigned PointeeAlign = 0, StringRef Name = "",
+                     StringRef TypeName = "", StringRef BaseTypeName = "",
+                     StringRef AccQual = "", StringRef TypeQual = "");
+
+  void emitHiddenKernelArgs(const Function &Func, unsigned &Offset,
+                            msgpack::ArrayNode &Args);
+
+public:
+  MetadataStreamerV3() = default;
+  ~MetadataStreamerV3() = default;
+
+  bool emitTo(AMDGPUTargetStreamer &TargetStreamer) override;
+
+  void begin(const Module &Mod) override;
+
+  void end() override;
+
+  void emitKernel(const MachineFunction &MF,
+                  const SIProgramInfo &ProgramInfo) override;
+};
+
+class MetadataStreamerV2 final : public MetadataStreamer {
private:
Metadata HSAMetadata;
- AMDGPUAS AMDGPUASI;
void dump(StringRef HSAMetadataString) const;
@@ -45,7 +131,7 @@ private:
AccessQualifier getAccessQualifier(StringRef AccQual) const;
- AddressSpaceQualifier getAddressSpaceQualifer(unsigned AddressSpace) const;
+ AddressSpaceQualifier getAddressSpaceQualifier(unsigned AddressSpace) const;
ValueKind getValueKind(Type *Ty, StringRef TypeQual,
StringRef BaseTypeName) const;
@@ -83,19 +169,22 @@ private:
void emitHiddenKernelArgs(const Function &Func);
-public:
- MetadataStreamer() = default;
- ~MetadataStreamer() = default;
-
const Metadata &getHSAMetadata() const {
return HSAMetadata;
}
- void begin(const Module &Mod);
+public:
+ MetadataStreamerV2() = default;
+ ~MetadataStreamerV2() = default;
+
+ bool emitTo(AMDGPUTargetStreamer &TargetStreamer) override;
+
+ void begin(const Module &Mod) override;
- void end();
+ void end() override;
- void emitKernel(const MachineFunction &MF, const SIProgramInfo &ProgramInfo);
+ void emitKernel(const MachineFunction &MF,
+ const SIProgramInfo &ProgramInfo) override;
};
} // end namespace HSAMD
diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index f25f4d4693eac..a0a045e72a58f 100644
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -29,7 +29,7 @@
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/ISDOpcodes.h"
@@ -72,14 +72,12 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
// Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can
// make the right decision when generating code for different targets.
const GCNSubtarget *Subtarget;
- AMDGPUAS AMDGPUASI;
bool EnableLateStructurizeCFG;
public:
explicit AMDGPUDAGToDAGISel(TargetMachine *TM = nullptr,
CodeGenOpt::Level OptLevel = CodeGenOpt::Default)
: SelectionDAGISel(*TM, OptLevel) {
- AMDGPUASI = AMDGPU::getAMDGPUAS(*TM);
EnableLateStructurizeCFG = AMDGPUTargetMachine::EnableLateStructurizeCFG;
}
~AMDGPUDAGToDAGISel() override = default;
@@ -87,7 +85,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AMDGPUArgumentUsageInfo>();
AU.addRequired<AMDGPUPerfHintAnalysis>();
- AU.addRequired<DivergenceAnalysis>();
+ AU.addRequired<LegacyDivergenceAnalysis>();
SelectionDAGISel::getAnalysisUsage(AU);
}
@@ -103,9 +101,12 @@ private:
std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const;
bool isNoNanSrc(SDValue N) const;
bool isInlineImmediate(const SDNode *N) const;
-
+ bool isVGPRImm(const SDNode *N) const;
+ bool isUniformLoad(const SDNode *N) const;
bool isUniformBr(const SDNode *N) const;
+ MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const;
+
SDNode *glueCopyToM0(SDNode *N) const;
const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
@@ -140,13 +141,6 @@ private:
SDValue &Offset, SDValue &SLC) const;
bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
SDValue &Offset) const;
- bool SelectMUBUFConstant(SDValue Constant,
- SDValue &SOffset,
- SDValue &ImmOffset) const;
- bool SelectMUBUFIntrinsicOffset(SDValue Offset, SDValue &SOffset,
- SDValue &ImmOffset) const;
- bool SelectMUBUFIntrinsicVOffset(SDValue Offset, SDValue &SOffset,
- SDValue &ImmOffset, SDValue &VOffset) const;
bool SelectFlatAtomic(SDValue Addr, SDValue &VAddr,
SDValue &Offset, SDValue &SLC) const;
@@ -224,7 +218,6 @@ protected:
class R600DAGToDAGISel : public AMDGPUDAGToDAGISel {
const R600Subtarget *Subtarget;
- AMDGPUAS AMDGPUASI;
bool isConstantLoad(const MemSDNode *N, int cbID) const;
bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr);
@@ -232,9 +225,7 @@ class R600DAGToDAGISel : public AMDGPUDAGToDAGISel {
SDValue& Offset);
public:
explicit R600DAGToDAGISel(TargetMachine *TM, CodeGenOpt::Level OptLevel) :
- AMDGPUDAGToDAGISel(TM, OptLevel) {
- AMDGPUASI = AMDGPU::getAMDGPUAS(*TM);
- }
+ AMDGPUDAGToDAGISel(TM, OptLevel) {}
void Select(SDNode *N) override;
@@ -251,12 +242,12 @@ protected:
} // end anonymous namespace
-INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "isel",
+INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "amdgpu-isel",
"AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo)
INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysis)
-INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
-INITIALIZE_PASS_END(AMDGPUDAGToDAGISel, "isel",
+INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
+INITIALIZE_PASS_END(AMDGPUDAGToDAGISel, "amdgpu-isel",
"AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
/// This pass converts a legalized DAG into a AMDGPU-specific
@@ -350,7 +341,7 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
}
SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const {
- if (cast<MemSDNode>(N)->getAddressSpace() != AMDGPUASI.LOCAL_ADDRESS ||
+ if (cast<MemSDNode>(N)->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS ||
!Subtarget->ldsRequiresM0Init())
return N;
@@ -372,6 +363,22 @@ SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const {
return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
}
+MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
+                                                  EVT VT) const {
+  // Move each 32-bit half of the immediate with its own S_MOV_B32.
+  const uint64_t LoBits = Imm & 0xFFFFFFFF;
+  const uint64_t HiBits = Imm >> 32;
+
+  SDValue LoImm = CurDAG->getConstant(LoBits, DL, MVT::i32);
+  SDNode *LoMov =
+      CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, LoImm);
+  SDValue HiImm = CurDAG->getConstant(HiBits, DL, MVT::i32);
+  SDNode *HiMov =
+      CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, HiImm);
+
+  // Combine the halves with a REG_SEQUENCE in the SReg_64 register class:
+  // the low half goes to sub0 and the high half to sub1.
+  SDValue RegClass =
+      CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
+  SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
+  SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
+  const SDValue Ops[] = {RegClass, SDValue(LoMov, 0), Sub0, SDValue(HiMov, 0),
+                         Sub1};
+
+  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
+}
+
static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) {
switch (NumVectorElts) {
case 1:
@@ -557,19 +564,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
}
SDLoc DL(N);
- SDNode *Lo = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
- CurDAG->getConstant(Imm & 0xFFFFFFFF, DL,
- MVT::i32));
- SDNode *Hi = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
- CurDAG->getConstant(Imm >> 32, DL, MVT::i32));
- const SDValue Ops[] = {
- CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
- SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
- SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
- };
-
- ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
- N->getValueType(0), Ops));
+ ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
return;
}
case ISD::LOAD:
@@ -641,6 +636,20 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
case AMDGPUISD::ATOMIC_CMP_SWAP:
SelectATOMIC_CMP_SWAP(N);
return;
+ case AMDGPUISD::CVT_PKRTZ_F16_F32:
+ case AMDGPUISD::CVT_PKNORM_I16_F32:
+ case AMDGPUISD::CVT_PKNORM_U16_F32:
+ case AMDGPUISD::CVT_PK_U16_U32:
+ case AMDGPUISD::CVT_PK_I16_I32: {
+ // Hack around using a legal type if f16 is illegal.
+ if (N->getValueType(0) == MVT::i32) {
+ MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
+ N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
+ { N->getOperand(0), N->getOperand(1) });
+ SelectCode(N);
+ return;
+ }
+ }
}
SelectCode(N);
@@ -969,8 +978,6 @@ bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
// default case
- // FIXME: This is broken on SI where we still need to check if the base
- // pointer is positive here.
Base = Addr;
Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8);
Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8);
@@ -1000,55 +1007,72 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
+ ConstantSDNode *C1 = nullptr;
+ SDValue N0 = Addr;
if (CurDAG->isBaseWithConstantOffset(Addr)) {
- SDValue N0 = Addr.getOperand(0);
- SDValue N1 = Addr.getOperand(1);
- ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
+ C1 = cast<ConstantSDNode>(Addr.getOperand(1));
+ if (isUInt<32>(C1->getZExtValue()))
+ N0 = Addr.getOperand(0);
+ else
+ C1 = nullptr;
+ }
- if (N0.getOpcode() == ISD::ADD) {
- // (add (add N2, N3), C1) -> addr64
- SDValue N2 = N0.getOperand(0);
- SDValue N3 = N0.getOperand(1);
- Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
+ if (N0.getOpcode() == ISD::ADD) {
+ // (add N2, N3) -> addr64, or
+ // (add (add N2, N3), C1) -> addr64
+ SDValue N2 = N0.getOperand(0);
+ SDValue N3 = N0.getOperand(1);
+ Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
+
+ if (N2->isDivergent()) {
+ if (N3->isDivergent()) {
+ // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
+ // addr64, and construct the resource from a 0 address.
+ Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
+ VAddr = N0;
+ } else {
+ // N2 is divergent, N3 is not.
+ Ptr = N3;
+ VAddr = N2;
+ }
+ } else {
+ // N2 is not divergent.
Ptr = N2;
VAddr = N3;
- } else {
- // (add N0, C1) -> offset
- VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
- Ptr = N0;
}
-
- if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue())) {
- Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
- return true;
- }
-
- if (isUInt<32>(C1->getZExtValue())) {
- // Illegal offset, store it in soffset.
- Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
- SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
- CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
- 0);
- return true;
- }
- }
-
- if (Addr.getOpcode() == ISD::ADD) {
- // (add N0, N1) -> addr64
- SDValue N0 = Addr.getOperand(0);
- SDValue N1 = Addr.getOperand(1);
+ Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
+ } else if (N0->isDivergent()) {
+ // N0 is divergent. Use it as the addr64, and construct the resource from a
+ // 0 address.
+ Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
+ VAddr = N0;
Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
+ } else {
+ // N0 -> offset, or
+ // (N0 + C1) -> offset
+ VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
Ptr = N0;
- VAddr = N1;
+ }
+
+ if (!C1) {
+ // No offset.
Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
return true;
}
- // default case -> offset
- VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
- Ptr = Addr;
- Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
+ if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue())) {
+ // Legal offset for instruction.
+ Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
+ return true;
+ }
+ // Illegal offset, store it in soffset.
+ Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
+ SOffset =
+ SDValue(CurDAG->getMachineNode(
+ AMDGPU::S_MOV_B32, DL, MVT::i32,
+ CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
+ 0);
return true;
}
@@ -1252,101 +1276,6 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE);
}
-bool AMDGPUDAGToDAGISel::SelectMUBUFConstant(SDValue Constant,
- SDValue &SOffset,
- SDValue &ImmOffset) const {
- SDLoc DL(Constant);
- const uint32_t Align = 4;
- const uint32_t MaxImm = alignDown(4095, Align);
- uint32_t Imm = cast<ConstantSDNode>(Constant)->getZExtValue();
- uint32_t Overflow = 0;
-
- if (Imm > MaxImm) {
- if (Imm <= MaxImm + 64) {
- // Use an SOffset inline constant for 4..64
- Overflow = Imm - MaxImm;
- Imm = MaxImm;
- } else {
- // Try to keep the same value in SOffset for adjacent loads, so that
- // the corresponding register contents can be re-used.
- //
- // Load values with all low-bits (except for alignment bits) set into
- // SOffset, so that a larger range of values can be covered using
- // s_movk_i32.
- //
- // Atomic operations fail to work correctly when individual address
- // components are unaligned, even if their sum is aligned.
- uint32_t High = (Imm + Align) & ~4095;
- uint32_t Low = (Imm + Align) & 4095;
- Imm = Low;
- Overflow = High - Align;
- }
- }
-
- // There is a hardware bug in SI and CI which prevents address clamping in
- // MUBUF instructions from working correctly with SOffsets. The immediate
- // offset is unaffected.
- if (Overflow > 0 &&
- Subtarget->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
- return false;
-
- ImmOffset = CurDAG->getTargetConstant(Imm, DL, MVT::i16);
-
- if (Overflow <= 64)
- SOffset = CurDAG->getTargetConstant(Overflow, DL, MVT::i32);
- else
- SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
- CurDAG->getTargetConstant(Overflow, DL, MVT::i32)),
- 0);
-
- return true;
-}
-
-bool AMDGPUDAGToDAGISel::SelectMUBUFIntrinsicOffset(SDValue Offset,
- SDValue &SOffset,
- SDValue &ImmOffset) const {
- SDLoc DL(Offset);
-
- if (!isa<ConstantSDNode>(Offset))
- return false;
-
- return SelectMUBUFConstant(Offset, SOffset, ImmOffset);
-}
-
-bool AMDGPUDAGToDAGISel::SelectMUBUFIntrinsicVOffset(SDValue Offset,
- SDValue &SOffset,
- SDValue &ImmOffset,
- SDValue &VOffset) const {
- SDLoc DL(Offset);
-
- // Don't generate an unnecessary voffset for constant offsets.
- if (isa<ConstantSDNode>(Offset)) {
- SDValue Tmp1, Tmp2;
-
- // When necessary, use a voffset in <= CI anyway to work around a hardware
- // bug.
- if (Subtarget->getGeneration() > AMDGPUSubtarget::SEA_ISLANDS ||
- SelectMUBUFConstant(Offset, Tmp1, Tmp2))
- return false;
- }
-
- if (CurDAG->isBaseWithConstantOffset(Offset)) {
- SDValue N0 = Offset.getOperand(0);
- SDValue N1 = Offset.getOperand(1);
- if (cast<ConstantSDNode>(N1)->getSExtValue() >= 0 &&
- SelectMUBUFConstant(N1, SOffset, ImmOffset)) {
- VOffset = N0;
- return true;
- }
- }
-
- SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
- ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16);
- VOffset = Offset;
-
- return true;
-}
-
template <bool IsSigned>
bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDValue Addr,
SDValue &VAddr,
@@ -1451,7 +1380,11 @@ bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
SDValue &Offset, bool &Imm) const {
SDLoc SL(Addr);
- if (CurDAG->isBaseWithConstantOffset(Addr)) {
+ // A 32-bit (address + offset) should not cause unsigned 32-bit integer
+ // wraparound, because s_load instructions perform the addition in 64 bits.
+ if ((Addr.getValueType() != MVT::i32 ||
+ Addr->getFlags().hasNoUnsignedWrap()) &&
+ CurDAG->isBaseWithConstantOffset(Addr)) {
SDValue N0 = Addr.getOperand(0);
SDValue N1 = Addr.getOperand(1);
@@ -1521,9 +1454,13 @@ bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
// (add n0, c0)
- Base = N0;
- Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
- return true;
+ // Don't peel off the offset (c0) if doing so could possibly lead
+ // the base (n0) to be negative.
+ if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0)) {
+ Base = N0;
+ Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
+ return true;
+ }
}
if (isa<ConstantSDNode>(Index))
@@ -1764,7 +1701,7 @@ void AMDGPUDAGToDAGISel::SelectFMAD_FMA(SDNode *N) {
void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) {
MemSDNode *Mem = cast<MemSDNode>(N);
unsigned AS = Mem->getAddressSpace();
- if (AS == AMDGPUASI.FLAT_ADDRESS) {
+ if (AS == AMDGPUAS::FLAT_ADDRESS) {
SelectCode(N);
return;
}
@@ -1812,9 +1749,8 @@ void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) {
return;
}
- MachineSDNode::mmo_iterator MMOs = MF->allocateMemRefsArray(1);
- *MMOs = Mem->getMemOperand();
- CmpSwap->setMemRefs(MMOs, MMOs + 1);
+ MachineMemOperand *MMO = Mem->getMemOperand();
+ CurDAG->setNodeMemRefs(CmpSwap, {MMO});
unsigned SubReg = Is32 ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
SDValue Extract
@@ -2113,6 +2049,80 @@ bool AMDGPUDAGToDAGISel::SelectHi16Elt(SDValue In, SDValue &Src) const {
return isExtractHiElt(In, Src);
}
+/// Examines up to 10 uses of \p N and decides from their operand register
+/// classes whether one of them strictly needs a VGPR.
+///
+/// \returns false for subtargets older than SOUTHERN_ISLANDS, when any
+/// examined use has an unknown or SGPR operand register class, when every
+/// examined use accepts VS_32 (directly or after commuting), or when the
+/// 10-use limit is reached. Returns true only when the scan stops early at a
+/// use that does not accept VS_32 even after commuting.
+bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
+  if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+    return false;
+  }
+  const SIRegisterInfo *SIRI =
+      static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
+  const SIInstrInfo * SII =
+      static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
+
+  unsigned Limit = 0;
+  bool AllUsesAcceptSReg = true;
+  for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
+       Limit < 10 && U != E; ++U, ++Limit) {
+    const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());
+
+    // If the register class is unknown, it could be an unknown
+    // register class that needs to be an SGPR, e.g. an inline asm
+    // constraint
+    if (!RC || SIRI->isSGPRClass(RC))
+      return false;
+
+    if (RC != &AMDGPU::VS_32RegClass) {
+      AllUsesAcceptSReg = false;
+      SDNode * User = *U;
+      if (User->isMachineOpcode()) {
+        unsigned Opc = User->getMachineOpcode();
+        // Bind by reference: the descriptor is only read here, so there is no
+        // need to copy it for every examined use.
+        const MCInstrDesc &Desc = SII->get(Opc);
+        if (Desc.isCommutable()) {
+          unsigned OpIdx = Desc.getNumDefs() + U.getOperandNo();
+          unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
+          if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
+            unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
+            const TargetRegisterClass *CommutedRC =
+                getOperandRegClass(*U, CommutedOpNo);
+            if (CommutedRC == &AMDGPU::VS_32RegClass)
+              AllUsesAcceptSReg = true;
+          }
+        }
+      }
+      // If AllUsesAcceptSReg is still false here, we have not succeeded in
+      // commuting the current user. This means there is at least one use
+      // that strictly requires a VGPR, so we do not attempt to commute
+      // any other user instructions.
+      if (!AllUsesAcceptSReg)
+        break;
+    }
+  }
+  return !AllUsesAcceptSReg && (Limit < 10);
+}
+
+bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode * N) const {
+  auto Ld = cast<LoadSDNode>(N);
+
+  // Loads aligned to less than 4 bytes never qualify.
+  if (Ld->getAlignment() < 4)
+    return false;
+
+  const unsigned AS = Ld->getAddressSpace();
+
+  // Case 1: a non-divergent load from either constant address space.
+  const bool IsConstantAS = AS == AMDGPUAS::CONSTANT_ADDRESS ||
+                            AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
+  if (IsConstantAS && !N->isDivergent())
+    return true;
+
+  // Case 2: with global scalarization enabled on the subtarget, a
+  // non-volatile, non-divergent global load whose memory operand is reported
+  // as not clobbered.
+  return Subtarget->getScalarizeGlobalBehavior() &&
+         AS == AMDGPUAS::GLOBAL_ADDRESS &&
+         !Ld->isVolatile() &&
+         !N->isDivergent() &&
+         static_cast<const SITargetLowering *>(getTargetLowering())
+             ->isMemOpHasNoClobberedMemOperand(N);
+}
+
void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
const AMDGPUTargetLowering& Lowering =
*static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
@@ -2148,10 +2158,10 @@ bool R600DAGToDAGISel::isConstantLoad(const MemSDNode *N, int CbId) const {
if (!N->readMem())
return false;
if (CbId == -1)
- return N->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
- N->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT;
+ return N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
+ N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
- return N->getAddressSpace() == AMDGPUASI.CONSTANT_BUFFER_0 + CbId;
+ return N->getAddressSpace() == AMDGPUAS::CONSTANT_BUFFER_0 + CbId;
}
bool R600DAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr,
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 21e44e9589d3c..6951c915b1772 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -128,10 +128,8 @@ EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
}
unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
- KnownBits Known;
EVT VT = Op.getValueType();
- DAG.computeKnownBits(Op, Known);
-
+ KnownBits Known = DAG.computeKnownBits(Op);
return VT.getSizeInBits() - Known.countMinLeadingZeros();
}
@@ -146,7 +144,6 @@ unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
const AMDGPUSubtarget &STI)
: TargetLowering(TM), Subtarget(&STI) {
- AMDGPUASI = AMDGPU::getAMDGPUAS(TM);
// Lower floating point store/load to integer store/load to reduce the number
// of patterns in tablegen.
setOperationAction(ISD::LOAD, MVT::f32, Promote);
@@ -318,6 +315,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FLOG, MVT::f32, Custom);
setOperationAction(ISD::FLOG10, MVT::f32, Custom);
+ setOperationAction(ISD::FEXP, MVT::f32, Custom);
setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
@@ -450,6 +448,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FCOS, VT, Expand);
setOperationAction(ISD::FDIV, VT, Expand);
setOperationAction(ISD::FEXP2, VT, Expand);
+ setOperationAction(ISD::FEXP, VT, Expand);
setOperationAction(ISD::FLOG2, VT, Expand);
setOperationAction(ISD::FREM, VT, Expand);
setOperationAction(ISD::FLOG, VT, Expand);
@@ -470,6 +469,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FCOPYSIGN, VT, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
setOperationAction(ISD::SETCC, VT, Expand);
+ setOperationAction(ISD::FCANONICALIZE, VT, Expand);
}
// This causes using an unrolled select operation rather than expansion with
@@ -550,6 +550,8 @@ static bool fnegFoldsIntoOp(unsigned Opc) {
case ISD::FMAD:
case ISD::FMINNUM:
case ISD::FMAXNUM:
+ case ISD::FMINNUM_IEEE:
+ case ISD::FMAXNUM_IEEE:
case ISD::FSIN:
case ISD::FTRUNC:
case ISD::FRINT:
@@ -562,6 +564,7 @@ static bool fnegFoldsIntoOp(unsigned Opc) {
case AMDGPUISD::FMUL_LEGACY:
case AMDGPUISD::FMIN_LEGACY:
case AMDGPUISD::FMAX_LEGACY:
+ case AMDGPUISD::FMED3:
return true;
default:
return false;
@@ -650,8 +653,11 @@ bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
}
bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
- ISD::LoadExtType,
+ ISD::LoadExtType ExtTy,
EVT NewVT) const {
+ // TODO: This may be worth removing. Check regression tests for diffs.
+ if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT))
+ return false;
unsigned NewSize = NewVT.getStoreSizeInBits();
@@ -662,6 +668,18 @@ bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
EVT OldVT = N->getValueType(0);
unsigned OldSize = OldVT.getStoreSizeInBits();
+ MemSDNode *MN = cast<MemSDNode>(N);
+ unsigned AS = MN->getAddressSpace();
+ // Do not shrink an aligned scalar load to sub-dword.
+ // Scalar engine cannot do sub-dword loads.
+ if (OldSize >= 32 && NewSize < 32 && MN->getAlignment() >= 4 &&
+ (AS == AMDGPUAS::CONSTANT_ADDRESS ||
+ AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
+ (isa<LoadSDNode>(N) &&
+ AS == AMDGPUAS::GLOBAL_ADDRESS && MN->isInvariant())) &&
+ AMDGPUInstrInfo::isUniformMMO(MN->getMemOperand()))
+ return false;
+
// Don't produce extloads from sub 32-bit types. SI doesn't have scalar
// extloads, so doing one requires using a buffer_load. In cases where we
// still couldn't use a scalar load, using the wider load shouldn't really
@@ -722,7 +740,7 @@ bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode * N) const {
{
const LoadSDNode * L = dyn_cast<LoadSDNode>(N);
if (L->getMemOperand()->getAddrSpace()
- == AMDGPUASI.CONSTANT_ADDRESS_32BIT)
+ == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
return true;
return false;
}
@@ -1140,6 +1158,8 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
return LowerFLOG(Op, DAG, 1 / AMDGPU_LOG2E_F);
case ISD::FLOG10:
return LowerFLOG(Op, DAG, AMDGPU_LN2_F / AMDGPU_LN10_F);
+ case ISD::FEXP:
+ return lowerFEXP(Op, DAG);
case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
@@ -1188,8 +1208,8 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = G->getGlobal();
- if (G->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS ||
- G->getAddressSpace() == AMDGPUASI.REGION_ADDRESS) {
+ if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+ G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
if (!MFI->isEntryFunction()) {
const Function &Fn = DAG.getMachineFunction().getFunction();
DiagnosticInfoUnsupported BadLDSDecl(
@@ -2213,6 +2233,34 @@ SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG,
return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand);
}
+// Return log2(e) (M_LOG2E) as an FP constant of type VT (f16, f32 or f64 elements).
+static SDValue getLog2EVal(SelectionDAG &DAG, const SDLoc &SL, EVT VT) {
+ switch (VT.getScalarType().getSimpleVT().SimpleTy) {
+ case MVT::f32:
+ return DAG.getConstantFP(1.44269504088896340735992468100189214f, SL, VT);
+ case MVT::f16:
+ return DAG.getConstantFP(
+ APFloat(APFloat::IEEEhalf(), "1.44269504088896340735992468100189214"),
+ SL, VT);
+ case MVT::f64:
+ return DAG.getConstantFP(
+ APFloat(APFloat::IEEEdouble(), "0x1.71547652b82fep+0"), SL, VT);
+ default:
+ llvm_unreachable("unsupported fp type");
+ }
+}
+
+// Lower exp(x) as exp2(x * log2(e)); both new nodes inherit Op's flags.
+SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ SDLoc SL(Op);
+ SDValue Src = Op.getOperand(0);
+
+ const SDValue K = getLog2EVal(DAG, SL, VT);
+ SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Src, K, Op->getFlags());
+ return DAG.getNode(ISD::FEXP2, SL, VT, Mul, Op->getFlags());
+}
+
static bool isCtlzOpc(unsigned Opc) {
return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
}
@@ -2669,21 +2717,33 @@ static bool isI24(SDValue Op, SelectionDAG &DAG) {
AMDGPUTargetLowering::numBitsSigned(Op, DAG) < 24;
}
-static bool simplifyI24(SDNode *Node24, unsigned OpIdx,
- TargetLowering::DAGCombinerInfo &DCI) {
-
+static SDValue simplifyI24(SDNode *Node24,
+ TargetLowering::DAGCombinerInfo &DCI) {
SelectionDAG &DAG = DCI.DAG;
- SDValue Op = Node24->getOperand(OpIdx);
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- EVT VT = Op.getValueType();
+ SDValue LHS = Node24->getOperand(0);
+ SDValue RHS = Node24->getOperand(1);
- APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24);
- APInt KnownZero, KnownOne;
- TargetLowering::TargetLoweringOpt TLO(DAG, true, true);
- if (TLI.SimplifyDemandedBits(Node24, OpIdx, Demanded, DCI, TLO))
- return true;
+ APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
- return false;
+ // First try to simplify using GetDemandedBits which allows the operands to
+ // have other uses, but will only perform simplifications that involve
+ // bypassing some nodes for this user.
+ SDValue DemandedLHS = DAG.GetDemandedBits(LHS, Demanded);
+ SDValue DemandedRHS = DAG.GetDemandedBits(RHS, Demanded);
+ if (DemandedLHS || DemandedRHS)
+ return DAG.getNode(Node24->getOpcode(), SDLoc(Node24), Node24->getVTList(),
+ DemandedLHS ? DemandedLHS : LHS,
+ DemandedRHS ? DemandedRHS : RHS);
+
+ // Now try SimplifyDemandedBits which can simplify the nodes used by our
+ // operands if this node is the only user.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
+ return SDValue(Node24, 0);
+ if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
+ return SDValue(Node24, 0);
+
+ return SDValue();
}
template <typename IntTy>
@@ -2920,8 +2980,7 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
// shl (ext x) => zext (shl x), if shift does not overflow int
if (VT != MVT::i64)
break;
- KnownBits Known;
- DAG.computeKnownBits(X, Known);
+ KnownBits Known = DAG.computeKnownBits(X);
unsigned LZ = Known.countMinLeadingZeros();
if (LZ < RHSVal)
break;
@@ -3080,8 +3139,7 @@ SDValue AMDGPUTargetLowering::performTruncateCombine(
Src.getOpcode() == ISD::SRA ||
Src.getOpcode() == ISD::SHL)) {
SDValue Amt = Src.getOperand(1);
- KnownBits Known;
- DAG.computeKnownBits(Amt, Known);
+ KnownBits Known = DAG.computeKnownBits(Amt);
unsigned Size = VT.getScalarSizeInBits();
if ((Known.isConstant() && Known.getConstant().ule(Size)) ||
(Known.getBitWidth() - Known.countMinLeadingZeros() <= Log2_32(Size))) {
@@ -3233,8 +3291,8 @@ SDValue AMDGPUTargetLowering::performMulLoHi24Combine(
SelectionDAG &DAG = DCI.DAG;
// Simplify demanded bits before splitting into multiple users.
- if (simplifyI24(N, 0, DCI) || simplifyI24(N, 1, DCI))
- return SDValue();
+ if (SDValue V = simplifyI24(N, DCI))
+ return V;
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -3449,9 +3507,27 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
}
-static bool isConstantFPZero(SDValue N) {
- if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
- return C->isZero() && !C->isNegative();
+static bool isInv2Pi(const APFloat &APF) { // True if APF is bit-identical to 1/(2*pi) in half, single or double precision.
+ static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118)); // ~0.15918, nearest half to 1/(2*pi)
+ static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983)); // ~0.15915494, 1/(2*pi) as single
+ static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882)); // 1/(2*pi) as double
+
+ return APF.bitwiseIsEqual(KF16) ||
+ APF.bitwiseIsEqual(KF32) ||
+ APF.bitwiseIsEqual(KF64);
+}
+
+// The negations of +0.0 and 1.0 / (2.0 * pi) do not have inline immediates, so
+// there is an additional cost to negate these constants.
+bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
+ if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N)) {
+ if (C->isZero() && !C->isNegative())
+ return true;
+
+ if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
+ return true;
+ }
+
 return false;
 }
@@ -3461,6 +3537,10 @@ static unsigned inverseMinMax(unsigned Opc) {
return ISD::FMINNUM;
case ISD::FMINNUM:
return ISD::FMAXNUM;
+ case ISD::FMAXNUM_IEEE:
+ return ISD::FMINNUM_IEEE;
+ case ISD::FMINNUM_IEEE:
+ return ISD::FMAXNUM_IEEE;
case AMDGPUISD::FMAX_LEGACY:
return AMDGPUISD::FMIN_LEGACY;
case AMDGPUISD::FMIN_LEGACY:
@@ -3566,6 +3646,8 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
}
case ISD::FMAXNUM:
case ISD::FMINNUM:
+ case ISD::FMAXNUM_IEEE:
+ case ISD::FMINNUM_IEEE:
case AMDGPUISD::FMAX_LEGACY:
case AMDGPUISD::FMIN_LEGACY: {
// fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
@@ -3577,9 +3659,8 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
SDValue RHS = N0.getOperand(1);
// 0 doesn't have a negated inline immediate.
- // TODO: Shouldn't fold 1/2pi either, and should be generalized to other
- // operations.
- if (isConstantFPZero(RHS))
+ // TODO: This constant check should be generalized to other operations.
+ if (isConstantCostlierToNegate(RHS))
return SDValue();
SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
@@ -3591,6 +3672,16 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
return Res;
}
+ case AMDGPUISD::FMED3: {
+ SDValue Ops[3];
+ for (unsigned I = 0; I < 3; ++I)
+ Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
+
+ SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
+ if (!N0.hasOneUse())
+ DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
+ return Res;
+ }
case ISD::FP_EXTEND:
case ISD::FTRUNC:
case ISD::FRINT:
@@ -3737,9 +3828,10 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
if (Src.getValueType() == MVT::i64) {
SDLoc SL(N);
uint64_t CVal = C->getZExtValue();
- return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT,
- DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
- DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
+ SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
+ DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
+ DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
}
}
@@ -3786,9 +3878,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
case AMDGPUISD::MUL_U24:
case AMDGPUISD::MULHI_I24:
case AMDGPUISD::MULHI_U24: {
- // If the first call to simplify is successfull, then N may end up being
- // deleted, so we shouldn't call simplifyI24 again.
- simplifyI24(N, 0, DCI) || simplifyI24(N, 1, DCI);
+ if (SDValue V = simplifyI24(N, DCI))
+ return V;
return SDValue();
}
case AMDGPUISD::MUL_LOHI_I24:
@@ -3943,13 +4034,12 @@ SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
const SDLoc &SL,
SDValue Chain,
- SDValue StackPtr,
SDValue ArgVal,
int64_t Offset) const {
MachineFunction &MF = DAG.getMachineFunction();
MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
- SDValue Ptr = DAG.getObjectPtrOffset(SL, StackPtr, Offset);
+ SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, 4,
MachineMemOperand::MODereferenceable);
return Store;
@@ -4111,6 +4201,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BUFFER_LOAD)
NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
+ NODE_NAME_CASE(SBUFFER_LOAD)
NODE_NAME_CASE(BUFFER_STORE)
NODE_NAME_CASE(BUFFER_STORE_FORMAT)
NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
@@ -4210,33 +4301,42 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
}
case AMDGPUISD::MUL_U24:
case AMDGPUISD::MUL_I24: {
- KnownBits LHSKnown, RHSKnown;
- DAG.computeKnownBits(Op.getOperand(0), LHSKnown, Depth + 1);
- DAG.computeKnownBits(Op.getOperand(1), RHSKnown, Depth + 1);
-
+ KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
+ KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
RHSKnown.countMinTrailingZeros();
Known.Zero.setLowBits(std::min(TrailZ, 32u));
- unsigned LHSValBits = 32 - std::max(LHSKnown.countMinSignBits(), 8u);
- unsigned RHSValBits = 32 - std::max(RHSKnown.countMinSignBits(), 8u);
- unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u);
- if (MaxValBits >= 32)
- break;
+ // Truncate to 24 bits.
+ LHSKnown = LHSKnown.trunc(24);
+ RHSKnown = RHSKnown.trunc(24);
+
bool Negative = false;
if (Opc == AMDGPUISD::MUL_I24) {
- bool LHSNegative = !!(LHSKnown.One & (1 << 23));
- bool LHSPositive = !!(LHSKnown.Zero & (1 << 23));
- bool RHSNegative = !!(RHSKnown.One & (1 << 23));
- bool RHSPositive = !!(RHSKnown.Zero & (1 << 23));
+ unsigned LHSValBits = 24 - LHSKnown.countMinSignBits();
+ unsigned RHSValBits = 24 - RHSKnown.countMinSignBits();
+ unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u);
+ if (MaxValBits >= 32)
+ break;
+ bool LHSNegative = LHSKnown.isNegative();
+ bool LHSPositive = LHSKnown.isNonNegative();
+ bool RHSNegative = RHSKnown.isNegative();
+ bool RHSPositive = RHSKnown.isNonNegative();
if ((!LHSNegative && !LHSPositive) || (!RHSNegative && !RHSPositive))
break;
Negative = (LHSNegative && RHSPositive) || (LHSPositive && RHSNegative);
- }
- if (Negative)
- Known.One.setHighBits(32 - MaxValBits);
- else
+ if (Negative)
+ Known.One.setHighBits(32 - MaxValBits);
+ else
+ Known.Zero.setHighBits(32 - MaxValBits);
+ } else {
+ unsigned LHSValBits = 24 - LHSKnown.countMinLeadingZeros();
+ unsigned RHSValBits = 24 - RHSKnown.countMinLeadingZeros();
+ unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u);
+ if (MaxValBits >= 32)
+ break;
Known.Zero.setHighBits(32 - MaxValBits);
+ }
break;
}
case AMDGPUISD::PERM: {
@@ -4244,9 +4344,8 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
if (!CMask)
return;
- KnownBits LHSKnown, RHSKnown;
- DAG.computeKnownBits(Op.getOperand(0), LHSKnown, Depth + 1);
- DAG.computeKnownBits(Op.getOperand(1), RHSKnown, Depth + 1);
+ KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
+ KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
unsigned Sel = CMask->getZExtValue();
for (unsigned I = 0; I < 32; I += 8) {
@@ -4320,3 +4419,107 @@ unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
return 1;
}
}
+
+bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
+ const SelectionDAG &DAG,
+ bool SNaN,
+ unsigned Depth) const {
+ unsigned Opcode = Op.getOpcode(); // Dispatch on the target node or intrinsic kind.
+ switch (Opcode) {
+ case AMDGPUISD::FMIN_LEGACY:
+ case AMDGPUISD::FMAX_LEGACY: {
+ if (SNaN) // NOTE(review): SNaN presumably limits the query to signaling NaNs; confirm against SelectionDAG::isKnownNeverNaN.
+ return true;
+
+ // TODO: Can check no nans on one of the operands for each one, but which
+ // one?
+ return false;
+ }
+ case AMDGPUISD::FMUL_LEGACY:
+ case AMDGPUISD::CVT_PKRTZ_F16_F32: {
+ if (SNaN)
+ return true;
+ return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
+ DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
+ }
+ case AMDGPUISD::FMED3:
+ case AMDGPUISD::FMIN3:
+ case AMDGPUISD::FMAX3:
+ case AMDGPUISD::FMAD_FTZ: {
+ if (SNaN)
+ return true;
+ return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
+ DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
+ DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
+ }
+ case AMDGPUISD::CVT_F32_UBYTE0:
+ case AMDGPUISD::CVT_F32_UBYTE1:
+ case AMDGPUISD::CVT_F32_UBYTE2:
+ case AMDGPUISD::CVT_F32_UBYTE3:
+ return true; // Reported as never NaN regardless of the SNaN flag or operands.
+
+ case AMDGPUISD::RCP:
+ case AMDGPUISD::RSQ:
+ case AMDGPUISD::RCP_LEGACY:
+ case AMDGPUISD::RSQ_LEGACY:
+ case AMDGPUISD::RSQ_CLAMP: {
+ if (SNaN)
+ return true;
+
+ // TODO: Need is known positive check.
+ return false;
+ }
+ case AMDGPUISD::LDEXP:
+ case AMDGPUISD::FRACT: {
+ if (SNaN)
+ return true;
+ return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
+ }
+ case AMDGPUISD::DIV_SCALE:
+ case AMDGPUISD::DIV_FMAS:
+ case AMDGPUISD::DIV_FIXUP:
+ case AMDGPUISD::TRIG_PREOP:
+ // TODO: Refine on operands.
+ return SNaN;
+ case AMDGPUISD::SIN_HW:
+ case AMDGPUISD::COS_HW: {
+ // TODO: Need check for infinity
+ return SNaN;
+ }
+ case ISD::INTRINSIC_WO_CHAIN: {
+ unsigned IntrinsicID
+ = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); // Operand 0 holds the intrinsic ID, so value operands start at index 1.
+ // TODO: Handle more intrinsics
+ switch (IntrinsicID) {
+ case Intrinsic::amdgcn_cubeid:
+ return true;
+
+ case Intrinsic::amdgcn_frexp_mant: {
+ if (SNaN)
+ return true;
+ return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
+ }
+ case Intrinsic::amdgcn_cvt_pkrtz: {
+ if (SNaN)
+ return true;
+ return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
+ DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
+ }
+ case Intrinsic::amdgcn_fdot2:
+ // TODO: Refine on operand
+ return SNaN;
+ default:
+ return false; // Unhandled intrinsic: conservatively report that it may be NaN.
+ }
+ }
+ default:
+ return false; // Unhandled opcode: conservatively report that it may be NaN.
+ }
+}
+
+TargetLowering::AtomicExpansionKind
+AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
+ if (RMW->getOperation() == AtomicRMWInst::Nand) // Only nand requests the CmpXChg expansion.
+ return AtomicExpansionKind::CmpXChg;
+ return AtomicExpansionKind::None; // Every other atomicrmw operation is left unexpanded.
+}
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index a4c3b413e1037..0d22cb2e3e20b 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -41,8 +41,6 @@ public:
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG);
protected:
- AMDGPUAS AMDGPUASI;
-
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
/// Split a vector store into multiple scalar stores.
@@ -58,8 +56,9 @@ protected:
SDValue LowerFROUND64(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerFLOG(SDValue Op, SelectionDAG &Dag,
+ SDValue LowerFLOG(SDValue Op, SelectionDAG &DAG,
double Log2BaseInverted) const;
+ SDValue lowerFEXP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const;
@@ -95,6 +94,8 @@ protected:
SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS,
SDValue RHS, DAGCombinerInfo &DCI) const;
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
+ bool isConstantCostlierToNegate(SDValue N) const;
SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const;
@@ -246,6 +247,11 @@ public:
const SelectionDAG &DAG,
unsigned Depth = 0) const override;
+ bool isKnownNeverNaNForTargetNode(SDValue Op,
+ const SelectionDAG &DAG,
+ bool SNaN = false,
+ unsigned Depth = 0) const override;
+
/// Helper function that adds Reg to the LiveIn list of the DAG's
/// MachineFunction.
///
@@ -279,7 +285,6 @@ public:
SDValue storeStackInputValue(SelectionDAG &DAG,
const SDLoc &SL,
SDValue Chain,
- SDValue StackPtr,
SDValue ArgVal,
int64_t Offset) const;
@@ -299,13 +304,11 @@ public:
uint32_t getImplicitParameterOffset(const MachineFunction &MF,
const ImplicitParameter Param) const;
- AMDGPUAS getAMDGPUAS() const {
- return AMDGPUASI;
- }
-
MVT getFenceOperandTy(const DataLayout &DL) const override {
return MVT::i32;
}
+
+ AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override;
};
namespace AMDGPUISD {
@@ -357,6 +360,7 @@ enum NodeType : unsigned {
SIN_HW,
FMAX_LEGACY,
FMIN_LEGACY,
+
FMAX3,
SMAX3,
UMAX3,
@@ -479,6 +483,7 @@ enum NodeType : unsigned {
BUFFER_LOAD,
BUFFER_LOAD_FORMAT,
BUFFER_LOAD_FORMAT_D16,
+ SBUFFER_LOAD,
BUFFER_STORE,
BUFFER_STORE_FORMAT,
BUFFER_STORE_FORMAT_D16,
diff --git a/lib/Target/AMDGPU/AMDGPUInline.cpp b/lib/Target/AMDGPU/AMDGPUInline.cpp
index 35dd9eb0a478d..945c9acd379a5 100644
--- a/lib/Target/AMDGPU/AMDGPUInline.cpp
+++ b/lib/Target/AMDGPU/AMDGPUInline.cpp
@@ -44,7 +44,7 @@ ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(2200),
cl::desc("Cost of alloca argument"));
// If the amount of scratch memory to eliminate exceeds our ability to allocate
-// it into registers we gain nothing by agressively inlining functions for that
+// it into registers we gain nothing by aggressively inlining functions for that
// heuristic.
static cl::opt<unsigned>
ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256),
@@ -118,8 +118,6 @@ unsigned AMDGPUInliner::getInlineThreshold(CallSite CS) const {
if (!Callee)
return (unsigned)Thres;
- const AMDGPUAS AS = AMDGPU::getAMDGPUAS(*Caller->getParent());
-
// If we have a pointer to private array passed into a function
// it will not be optimized out, leaving scratch usage.
// Increase the inline threshold to allow inliniting in this case.
@@ -128,7 +126,7 @@ unsigned AMDGPUInliner::getInlineThreshold(CallSite CS) const {
for (Value *PtrArg : CS.args()) {
Type *Ty = PtrArg->getType();
if (!Ty->isPointerTy() ||
- Ty->getPointerAddressSpace() != AS.PRIVATE_ADDRESS)
+ Ty->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
continue;
PtrArg = GetUnderlyingObject(PtrArg, DL);
if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) {
@@ -174,18 +172,23 @@ InlineCost AMDGPUInliner::getInlineCost(CallSite CS) {
Function *Caller = CS.getCaller();
TargetTransformInfo &TTI = TTIWP->getTTI(*Callee);
- if (!Callee || Callee->isDeclaration() || CS.isNoInline() ||
- !TTI.areInlineCompatible(Caller, Callee))
- return llvm::InlineCost::getNever();
+ if (!Callee || Callee->isDeclaration())
+ return llvm::InlineCost::getNever("undefined callee");
+
+ if (CS.isNoInline())
+ return llvm::InlineCost::getNever("noinline");
+
+ if (!TTI.areInlineCompatible(Caller, Callee))
+ return llvm::InlineCost::getNever("incompatible");
if (CS.hasFnAttr(Attribute::AlwaysInline)) {
if (isInlineViable(*Callee))
- return llvm::InlineCost::getAlways();
- return llvm::InlineCost::getNever();
+ return llvm::InlineCost::getAlways("alwaysinline viable");
+ return llvm::InlineCost::getNever("alwaysinline unviable");
}
if (isWrapperOnlyCall(CS))
- return llvm::InlineCost::getAlways();
+ return llvm::InlineCost::getAlways("wrapper-only call");
InlineParams LocalParams = Params;
LocalParams.DefaultThreshold = (int)getInlineThreshold(CS);
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index 7442a59e594f1..82644be265638 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -62,18 +62,10 @@ def AMDGPULoopOp : SDTypeProfile<0, 2,
[SDTCisVT<0, i64>, SDTCisVT<1, OtherVT>]
>;
-def AMDGPUBreakOp : SDTypeProfile<1, 1,
- [SDTCisVT<0, i64>, SDTCisVT<1, i64>]
->;
-
def AMDGPUIfBreakOp : SDTypeProfile<1, 2,
[SDTCisVT<0, i64>, SDTCisVT<1, i1>, SDTCisVT<2, i64>]
>;
-def AMDGPUElseBreakOp : SDTypeProfile<1, 2,
- [SDTCisVT<0, i64>, SDTCisVT<1, i64>, SDTCisVT<2, i64>]
->;
-
def AMDGPUAddeSubeOp : SDTypeProfile<2, 3,
[SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisVT<0, i32>, SDTCisVT<1, i1>, SDTCisVT<4, i1>]
>;
diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 219d430fbb395..8eb49d49b2e08 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -55,7 +55,6 @@ AMDGPUInstructionSelector::AMDGPUInstructionSelector(
#define GET_GLOBALISEL_TEMPORARIES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_INIT
- ,AMDGPUASI(STI.getAMDGPUAS())
{
}
@@ -506,8 +505,8 @@ bool AMDGPUInstructionSelector::selectSMRD(MachineInstr &I,
if (!I.hasOneMemOperand())
return false;
- if ((*I.memoperands_begin())->getAddrSpace() != AMDGPUASI.CONSTANT_ADDRESS &&
- (*I.memoperands_begin())->getAddrSpace() != AMDGPUASI.CONSTANT_ADDRESS_32BIT)
+ if ((*I.memoperands_begin())->getAddrSpace() != AMDGPUAS::CONSTANT_ADDRESS &&
+ (*I.memoperands_begin())->getAddrSpace() != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
return false;
if (!isInstrUniform(I))
@@ -631,6 +630,7 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I,
return selectImpl(I, CoverageInfo);
case TargetOpcode::G_ADD:
return selectG_ADD(I);
+ case TargetOpcode::G_INTTOPTR:
case TargetOpcode::G_BITCAST:
return selectCOPY(I);
case TargetOpcode::G_CONSTANT:
diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 68b40b20aca24..449431adc561a 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -105,9 +105,6 @@ private:
#define GET_GLOBALISEL_TEMPORARIES_DECL
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_DECL
-
-protected:
- AMDGPUAS AMDGPUASI;
};
} // End llvm namespace.
diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td
index c9c932ef2f5fb..eb8f2002ff2dc 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -135,6 +135,12 @@ def brtarget : Operand<OtherVT>;
// Misc. PatFrags
//===----------------------------------------------------------------------===//
+class HasOneUseUnaryOp<SDPatternOperator op> : PatFrag< // Unary op fragment that matches only when N->hasOneUse() holds.
+ (ops node:$src0),
+ (op $src0),
+ [{ return N->hasOneUse(); }]
+>;
class HasOneUseBinOp<SDPatternOperator op> : PatFrag<
(ops node:$src0, node:$src1),
(op $src0, $src1),
@@ -152,13 +158,21 @@ def smax_oneuse : HasOneUseBinOp<smax>;
def smin_oneuse : HasOneUseBinOp<smin>;
def umax_oneuse : HasOneUseBinOp<umax>;
def umin_oneuse : HasOneUseBinOp<umin>;
+
def fminnum_oneuse : HasOneUseBinOp<fminnum>;
def fmaxnum_oneuse : HasOneUseBinOp<fmaxnum>;
+
+def fminnum_ieee_oneuse : HasOneUseBinOp<fminnum_ieee>;
+def fmaxnum_ieee_oneuse : HasOneUseBinOp<fmaxnum_ieee>;
+
+
def and_oneuse : HasOneUseBinOp<and>;
def or_oneuse : HasOneUseBinOp<or>;
def xor_oneuse : HasOneUseBinOp<xor>;
} // Properties = [SDNPCommutative, SDNPAssociative]
+def not_oneuse : HasOneUseUnaryOp<not>;
+
def add_oneuse : HasOneUseBinOp<add>;
def sub_oneuse : HasOneUseBinOp<sub>;
@@ -167,6 +181,9 @@ def shl_oneuse : HasOneUseBinOp<shl>;
def select_oneuse : HasOneUseTernaryOp<select>;
+def AMDGPUmul_u24_oneuse : HasOneUseBinOp<AMDGPUmul_u24>;
+def AMDGPUmul_i24_oneuse : HasOneUseBinOp<AMDGPUmul_i24>;
+
def srl_16 : PatFrag<
(ops node:$src0), (srl_oneuse node:$src0, (i32 16))
>;
@@ -328,37 +345,37 @@ class StoreHi16<SDPatternOperator op> : PatFrag <
>;
class PrivateAddress : CodePatPred<[{
- return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.PRIVATE_ADDRESS;
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS;
}]>;
class ConstantAddress : CodePatPred<[{
- return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS;
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
}]>;
class LocalAddress : CodePatPred<[{
- return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS;
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
}]>;
class GlobalAddress : CodePatPred<[{
- return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS;
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
}]>;
class GlobalLoadAddress : CodePatPred<[{
auto AS = cast<MemSDNode>(N)->getAddressSpace();
- return AS == AMDGPUASI.GLOBAL_ADDRESS || AS == AMDGPUASI.CONSTANT_ADDRESS;
+ return AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::CONSTANT_ADDRESS;
}]>;
class FlatLoadAddress : CodePatPred<[{
const auto AS = cast<MemSDNode>(N)->getAddressSpace();
- return AS == AMDGPUASI.FLAT_ADDRESS ||
- AS == AMDGPUASI.GLOBAL_ADDRESS ||
- AS == AMDGPUASI.CONSTANT_ADDRESS;
+ return AS == AMDGPUAS::FLAT_ADDRESS ||
+ AS == AMDGPUAS::GLOBAL_ADDRESS ||
+ AS == AMDGPUAS::CONSTANT_ADDRESS;
}]>;
class FlatStoreAddress : CodePatPred<[{
const auto AS = cast<MemSDNode>(N)->getAddressSpace();
- return AS == AMDGPUASI.FLAT_ADDRESS ||
- AS == AMDGPUASI.GLOBAL_ADDRESS;
+ return AS == AMDGPUAS::FLAT_ADDRESS ||
+ AS == AMDGPUAS::GLOBAL_ADDRESS;
}]>;
class AZExtLoadBase <SDPatternOperator ld_node>: PatFrag<(ops node:$ptr),
@@ -480,7 +497,7 @@ def az_extloadi16_constant : ConstantLoad <az_extloadi16>;
class local_binary_atomic_op<SDNode atomic_op> :
PatFrag<(ops node:$ptr, node:$value),
(atomic_op node:$ptr, node:$value), [{
- return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS;
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
}]>;
def atomic_swap_local : local_binary_atomic_op<atomic_swap>;
@@ -497,14 +514,14 @@ def atomic_load_umax_local : local_binary_atomic_op<atomic_load_umax>;
def mskor_global : PatFrag<(ops node:$val, node:$ptr),
(AMDGPUstore_mskor node:$val, node:$ptr), [{
- return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS;
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
}]>;
class AtomicCmpSwapLocal <SDNode cmp_swap_node> : PatFrag<
(ops node:$ptr, node:$cmp, node:$swap),
(cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{
AtomicSDNode *AN = cast<AtomicSDNode>(N);
- return AN->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS;
+ return AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
}]>;
def atomic_cmp_swap_local : AtomicCmpSwapLocal <atomic_cmp_swap>;
@@ -513,17 +530,17 @@ multiclass global_binary_atomic_op<SDNode atomic_op> {
def "" : PatFrag<
(ops node:$ptr, node:$value),
(atomic_op node:$ptr, node:$value),
- [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS;}]>;
+ [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]>;
def _noret : PatFrag<
(ops node:$ptr, node:$value),
(atomic_op node:$ptr, node:$value),
- [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>;
+ [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>;
def _ret : PatFrag<
(ops node:$ptr, node:$value),
(atomic_op node:$ptr, node:$value),
- [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>;
+ [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>;
}
defm atomic_swap_global : global_binary_atomic_op<atomic_swap>;
@@ -550,12 +567,12 @@ def atomic_cmp_swap_global : PatFrag<
def atomic_cmp_swap_global_noret : PatFrag<
(ops node:$ptr, node:$cmp, node:$value),
(atomic_cmp_swap node:$ptr, node:$cmp, node:$value),
- [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>;
+ [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>;
def atomic_cmp_swap_global_ret : PatFrag<
(ops node:$ptr, node:$cmp, node:$value),
(atomic_cmp_swap node:$ptr, node:$cmp, node:$value),
- [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>;
+ [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>;
//===----------------------------------------------------------------------===//
// Misc Pattern Fragments
@@ -787,18 +804,30 @@ class ROTRPattern <Instruction BIT_ALIGN> : AMDGPUPat <
(BIT_ALIGN $src0, $src0, $src1)
>;
-// This matches 16 permutations of
-// max(min(x, y), min(max(x, y), z))
-class IntMed3Pat<Instruction med3Inst,
+multiclass IntMed3Pat<Instruction med3Inst,
+ SDPatternOperator min,
SDPatternOperator max,
- SDPatternOperator max_oneuse,
SDPatternOperator min_oneuse,
- ValueType vt = i32> : AMDGPUPat<
+ SDPatternOperator max_oneuse,
+ ValueType vt = i32> {
+
+ // This matches 16 permutations of
+ // min(max(a, b), max(min(a, b), c))
+ def : AMDGPUPat <
+ (min (max_oneuse vt:$src0, vt:$src1),
+ (max_oneuse (min_oneuse vt:$src0, vt:$src1), vt:$src2)),
+ (med3Inst vt:$src0, vt:$src1, vt:$src2)
+>;
+
+ // This matches 16 permutations of
+ // max(min(x, y), min(max(x, y), z))
+ def : AMDGPUPat <
(max (min_oneuse vt:$src0, vt:$src1),
(min_oneuse (max_oneuse vt:$src0, vt:$src1), vt:$src2)),
(med3Inst $src0, $src1, $src2)
>;
-
+}
+
// Special conversion patterns
def cvt_rpi_i32_f32 : PatFrag <
@@ -813,6 +842,7 @@ def cvt_flr_i32_f32 : PatFrag <
[{ (void)N; return TM.Options.NoNaNsFPMath; }]
>;
+let AddedComplexity = 2 in {
class IMad24Pat<Instruction Inst, bit HasClamp = 0> : AMDGPUPat <
(add (AMDGPUmul_i24 i32:$src0, i32:$src1), i32:$src2),
!if(HasClamp, (Inst $src0, $src1, $src2, (i1 0)),
@@ -824,6 +854,7 @@ class UMad24Pat<Instruction Inst, bit HasClamp = 0> : AMDGPUPat <
!if(HasClamp, (Inst $src0, $src1, $src2, (i1 0)),
(Inst $src0, $src1, $src2))
>;
+} // AddedComplexity.
class RcpPat<Instruction RcpInst, ValueType vt> : AMDGPUPat <
(fdiv FP_ONE, vt:$src),
@@ -834,3 +865,25 @@ class RsqPat<Instruction RsqInst, ValueType vt> : AMDGPUPat <
(AMDGPUrcp (fsqrt vt:$src)),
(RsqInst $src)
>;
+
+// Instructions which select to the same v_min_f*
+def fminnum_like : PatFrags<(ops node:$src0, node:$src1),
+ [(fminnum_ieee node:$src0, node:$src1),
+ (fminnum node:$src0, node:$src1)]
+>;
+
+// Instructions which select to the same v_max_f*
+def fmaxnum_like : PatFrags<(ops node:$src0, node:$src1),
+ [(fmaxnum_ieee node:$src0, node:$src1),
+ (fmaxnum node:$src0, node:$src1)]
+>;
+
+def fminnum_like_oneuse : PatFrags<(ops node:$src0, node:$src1),
+ [(fminnum_ieee_oneuse node:$src0, node:$src1),
+ (fminnum_oneuse node:$src0, node:$src1)]
+>;
+
+def fmaxnum_like_oneuse : PatFrags<(ops node:$src0, node:$src1),
+ [(fmaxnum_ieee_oneuse node:$src0, node:$src1),
+ (fmaxnum_oneuse node:$src0, node:$src1)]
+>;
diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
index 896e2055cf620..02108ca3ddd78 100644
--- a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
@@ -40,7 +40,7 @@ StringRef AMDGPUIntrinsicInfo::getName(unsigned IntrID,
if (IntrID < Intrinsic::num_intrinsics)
return StringRef();
- assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics &&
+ assert(IntrID < SIIntrinsic::num_AMDGPU_intrinsics &&
"Invalid intrinsic ID");
return IntrinsicNameTable[IntrID - Intrinsic::num_intrinsics];
@@ -91,7 +91,7 @@ Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
= cast<Function>(M->getOrInsertFunction(getName(IntrID, Tys), FTy));
AttributeList AS =
- getAttributes(M->getContext(), static_cast<AMDGPUIntrinsic::ID>(IntrID));
+ getAttributes(M->getContext(), static_cast<SIIntrinsic::ID>(IntrID));
F->setAttributes(AS);
return F;
}
diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h
index ef42f9a319af6..a1a094dded23d 100644
--- a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h
@@ -20,7 +20,7 @@
namespace llvm {
class TargetMachine;
-namespace AMDGPUIntrinsic {
+namespace SIIntrinsic {
enum ID {
last_non_AMDGPU_intrinsic = Intrinsic::num_intrinsics - 1,
#define GET_INTRINSIC_ENUM_VALUES
diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsics.td b/lib/Target/AMDGPU/AMDGPUIntrinsics.td
deleted file mode 100644
index 230a046285047..0000000000000
--- a/lib/Target/AMDGPU/AMDGPUIntrinsics.td
+++ /dev/null
@@ -1,16 +0,0 @@
-//===-- AMDGPUIntrinsics.td - Common intrinsics -*- tablegen -*-----------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines intrinsics that are used by all hw codegen targets.
-//
-//===----------------------------------------------------------------------===//
-
-let TargetPrefix = "AMDGPU", isTarget = 1 in {
- def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>;
-}
diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 87b072c9ea20a..ef85c1040545f 100644
--- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -32,20 +32,52 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST,
return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
};
- auto AMDGPUAS = ST.getAMDGPUAS();
-
const LLT S1 = LLT::scalar(1);
- const LLT V2S16 = LLT::vector(2, 16);
-
const LLT S32 = LLT::scalar(32);
const LLT S64 = LLT::scalar(64);
const LLT S512 = LLT::scalar(512);
+ const LLT V2S16 = LLT::vector(2, 16);
+ const LLT V4S16 = LLT::vector(4, 16);
+ const LLT V8S16 = LLT::vector(8, 16);
+
+ const LLT V2S32 = LLT::vector(2, 32);
+ const LLT V3S32 = LLT::vector(3, 32);
+ const LLT V4S32 = LLT::vector(4, 32);
+ const LLT V5S32 = LLT::vector(5, 32);
+ const LLT V6S32 = LLT::vector(6, 32);
+ const LLT V7S32 = LLT::vector(7, 32);
+ const LLT V8S32 = LLT::vector(8, 32);
+ const LLT V9S32 = LLT::vector(9, 32);
+ const LLT V10S32 = LLT::vector(10, 32);
+ const LLT V11S32 = LLT::vector(11, 32);
+ const LLT V12S32 = LLT::vector(12, 32);
+ const LLT V13S32 = LLT::vector(13, 32);
+ const LLT V14S32 = LLT::vector(14, 32);
+ const LLT V15S32 = LLT::vector(15, 32);
+ const LLT V16S32 = LLT::vector(16, 32);
+
+ const LLT V2S64 = LLT::vector(2, 64);
+ const LLT V3S64 = LLT::vector(3, 64);
+ const LLT V4S64 = LLT::vector(4, 64);
+ const LLT V5S64 = LLT::vector(5, 64);
+ const LLT V6S64 = LLT::vector(6, 64);
+ const LLT V7S64 = LLT::vector(7, 64);
+ const LLT V8S64 = LLT::vector(8, 64);
+
+ std::initializer_list<LLT> AllS32Vectors =
+ {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
+ V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32};
+ std::initializer_list<LLT> AllS64Vectors =
+ {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64};
+
const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
- const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS.FLAT_ADDRESS);
- const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS.PRIVATE_ADDRESS);
+ const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
+ const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
+
+ const LLT CodePtr = FlatPtr;
const LLT AddrSpaces[] = {
GlobalPtr,
@@ -55,13 +87,20 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST,
PrivatePtr
};
+ setAction({G_BRCOND, S1}, Legal);
+
setAction({G_ADD, S32}, Legal);
setAction({G_ASHR, S32}, Legal);
setAction({G_SUB, S32}, Legal);
setAction({G_MUL, S32}, Legal);
- setAction({G_AND, S32}, Legal);
- setAction({G_OR, S32}, Legal);
- setAction({G_XOR, S32}, Legal);
+
+ // FIXME: 64-bit ones only legal for scalar
+ getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
+ .legalFor({S32, S1, S64, V2S32});
+
+ getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO,
+ G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
+ .legalFor({{S32, S1}});
setAction({G_BITCAST, V2S16}, Legal);
setAction({G_BITCAST, 1, S32}, Legal);
@@ -90,35 +129,80 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST,
// between these two scenarios.
setAction({G_CONSTANT, S1}, Legal);
- setAction({G_FADD, S32}, Legal);
+ setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
+
+ getActionDefinitionsBuilder(
+ { G_FADD, G_FMUL, G_FNEG, G_FABS, G_FMA})
+ .legalFor({S32, S64});
+
+ getActionDefinitionsBuilder(G_FPTRUNC)
+ .legalFor({{S32, S64}});
+
+ // Use actual fsub instruction
+ setAction({G_FSUB, S32}, Legal);
+
+ // Must use fadd + fneg
+ setAction({G_FSUB, S64}, Lower);
setAction({G_FCMP, S1}, Legal);
setAction({G_FCMP, 1, S32}, Legal);
setAction({G_FCMP, 1, S64}, Legal);
- setAction({G_FMUL, S32}, Legal);
-
setAction({G_ZEXT, S64}, Legal);
setAction({G_ZEXT, 1, S32}, Legal);
+ setAction({G_SEXT, S64}, Legal);
+ setAction({G_SEXT, 1, S32}, Legal);
+
+ setAction({G_ANYEXT, S64}, Legal);
+ setAction({G_ANYEXT, 1, S32}, Legal);
+
setAction({G_FPTOSI, S32}, Legal);
setAction({G_FPTOSI, 1, S32}, Legal);
setAction({G_SITOFP, S32}, Legal);
setAction({G_SITOFP, 1, S32}, Legal);
+ setAction({G_UITOFP, S32}, Legal);
+ setAction({G_UITOFP, 1, S32}, Legal);
+
setAction({G_FPTOUI, S32}, Legal);
setAction({G_FPTOUI, 1, S32}, Legal);
+ setAction({G_FPOW, S32}, Legal);
+ setAction({G_FEXP2, S32}, Legal);
+ setAction({G_FLOG2, S32}, Legal);
+
+ getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_INTRINSIC_ROUND})
+ .legalFor({S32, S64});
+
for (LLT PtrTy : AddrSpaces) {
LLT IdxTy = LLT::scalar(PtrTy.getSizeInBits());
setAction({G_GEP, PtrTy}, Legal);
setAction({G_GEP, 1, IdxTy}, Legal);
}
+ setAction({G_BLOCK_ADDR, CodePtr}, Legal);
+
setAction({G_ICMP, S1}, Legal);
setAction({G_ICMP, 1, S32}, Legal);
+ setAction({G_CTLZ, S32}, Legal);
+ setAction({G_CTLZ_ZERO_UNDEF, S32}, Legal);
+ setAction({G_CTTZ, S32}, Legal);
+ setAction({G_CTTZ_ZERO_UNDEF, S32}, Legal);
+ setAction({G_BSWAP, S32}, Legal);
+ setAction({G_CTPOP, S32}, Legal);
+
+ getActionDefinitionsBuilder(G_INTTOPTR)
+ .legalIf([](const LegalityQuery &Query) {
+ return true;
+ });
+
+ getActionDefinitionsBuilder(G_PTRTOINT)
+ .legalIf([](const LegalityQuery &Query) {
+ return true;
+ });
getActionDefinitionsBuilder({G_LOAD, G_STORE})
.legalIf([=, &ST](const LegalityQuery &Query) {
@@ -145,6 +229,16 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST,
});
+ auto &Atomics = getActionDefinitionsBuilder(
+ {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
+ G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
+ G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
+ G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
+ .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
+ {S64, GlobalPtr}, {S64, LocalPtr}});
+ if (ST.hasFlatAddressSpace()) {
+ Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
+ }
setAction({G_SELECT, S32}, Legal);
setAction({G_SELECT, 1, S1}, Legal);
@@ -180,6 +274,23 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST,
(Ty1.getSizeInBits() % 32 == 0);
});
+ getActionDefinitionsBuilder(G_BUILD_VECTOR)
+ .legalForCartesianProduct(AllS32Vectors, {S32})
+ .legalForCartesianProduct(AllS64Vectors, {S64})
+ .clampNumElements(0, V16S32, V16S32)
+ .clampNumElements(0, V2S64, V8S64)
+ .minScalarSameAs(1, 0);
+
+ // TODO: Support any combination of v2s32
+ getActionDefinitionsBuilder(G_CONCAT_VECTORS)
+ .legalFor({{V4S32, V2S32},
+ {V8S32, V2S32},
+ {V8S32, V4S32},
+ {V4S64, V2S64},
+ {V4S16, V2S16},
+ {V8S16, V2S16},
+ {V8S16, V4S16}});
+
// Merge/Unmerge
for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
diff --git a/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index 7a7ed7a4f0656..14e8800426911 100644
--- a/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -1333,8 +1333,7 @@ bool AMDGPULibCalls::fold_sincos(CallInst *CI, IRBuilder<> &B,
// for OpenCL 2.0 we have only generic implementation of sincos
// function.
AMDGPULibFunc nf(AMDGPULibFunc::EI_SINCOS, fInfo);
- const AMDGPUAS AS = AMDGPU::getAMDGPUAS(*M);
- nf.getLeads()[0].PtrKind = AMDGPULibFunc::getEPtrKindFromAddrSpace(AS.FLAT_ADDRESS);
+ nf.getLeads()[0].PtrKind = AMDGPULibFunc::getEPtrKindFromAddrSpace(AMDGPUAS::FLAT_ADDRESS);
Function *Fsincos = dyn_cast_or_null<Function>(getFunction(M, nf));
if (!Fsincos) return false;
@@ -1347,7 +1346,7 @@ bool AMDGPULibCalls::fold_sincos(CallInst *CI, IRBuilder<> &B,
// The allocaInst allocates the memory in private address space. This need
// to be bitcasted to point to the address space of cos pointer type.
// In OpenCL 2.0 this is generic, while in 1.2 that is private.
- if (PTy->getPointerAddressSpace() != AS.PRIVATE_ADDRESS)
+ if (PTy->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
P = B.CreateAddrSpaceCast(Alloc, PTy);
CallInst *Call = CreateCallEx2(B, Fsincos, UI->getArgOperand(0), P);
diff --git a/lib/Target/AMDGPU/AMDGPULibFunc.cpp b/lib/Target/AMDGPU/AMDGPULibFunc.cpp
index 4671273d61f91..4fc3fe0f105b0 100644
--- a/lib/Target/AMDGPU/AMDGPULibFunc.cpp
+++ b/lib/Target/AMDGPU/AMDGPULibFunc.cpp
@@ -90,7 +90,6 @@ class UnmangledFuncInfo {
public:
using ID = AMDGPULibFunc::EFuncId;
- UnmangledFuncInfo() = default;
UnmangledFuncInfo(StringRef _Name, unsigned _NumArgs)
: Name(_Name), NumArgs(_NumArgs) {}
// Get index to Table by function name.
@@ -996,8 +995,10 @@ Function *AMDGPULibFunc::getOrInsertFunction(Module *M,
} else {
AttributeList Attr;
LLVMContext &Ctx = M->getContext();
- Attr.addAttribute(Ctx, AttributeList::FunctionIndex, Attribute::ReadOnly);
- Attr.addAttribute(Ctx, AttributeList::FunctionIndex, Attribute::NoUnwind);
+ Attr = Attr.addAttribute(Ctx, AttributeList::FunctionIndex,
+ Attribute::ReadOnly);
+ Attr = Attr.addAttribute(Ctx, AttributeList::FunctionIndex,
+ Attribute::NoUnwind);
C = M->getOrInsertFunction(FuncName, FuncTy, Attr);
}
diff --git a/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index c147830e12ed6..743dc7a0d00b9 100644
--- a/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -16,7 +16,6 @@
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
@@ -84,8 +83,8 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
return false;
CallInst *KernArgSegment =
- Builder.CreateIntrinsic(Intrinsic::amdgcn_kernarg_segment_ptr, nullptr,
- F.getName() + ".kernarg.segment");
+ Builder.CreateIntrinsic(Intrinsic::amdgcn_kernarg_segment_ptr, {}, {},
+ nullptr, F.getName() + ".kernarg.segment");
KernArgSegment->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
KernArgSegment->addAttribute(AttributeList::ReturnIndex,
@@ -123,14 +122,17 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
VectorType *VT = dyn_cast<VectorType>(ArgTy);
bool IsV3 = VT && VT->getNumElements() == 3;
+ bool DoShiftOpt = Size < 32 && !ArgTy->isAggregateType();
+
VectorType *V4Ty = nullptr;
int64_t AlignDownOffset = alignDown(EltOffset, 4);
int64_t OffsetDiff = EltOffset - AlignDownOffset;
- unsigned AdjustedAlign = MinAlign(KernArgBaseAlign, AlignDownOffset);
+ unsigned AdjustedAlign = MinAlign(DoShiftOpt ? AlignDownOffset : EltOffset,
+ KernArgBaseAlign);
Value *ArgPtr;
- if (Size < 32 && !ArgTy->isAggregateType()) { // FIXME: Handle aggregate types
+ if (DoShiftOpt) { // FIXME: Handle aggregate types
// Since we don't have sub-dword scalar loads, avoid doing an extload by
// loading earlier than the argument address, and extracting the relevant
// bits.
@@ -148,7 +150,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
} else {
ArgPtr = Builder.CreateConstInBoundsGEP1_64(
KernArgSegment,
- AlignDownOffset,
+ EltOffset,
Arg.getName() + ".kernarg.offset");
ArgPtr = Builder.CreateBitCast(ArgPtr, ArgTy->getPointerTo(AS),
ArgPtr->getName() + ".cast");
@@ -199,7 +201,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
// TODO: Convert noalias arg to !noalias
- if (Size < 32 && !ArgTy->isAggregateType()) {
+ if (DoShiftOpt) {
Value *ExtractBits = OffsetDiff == 0 ?
Load : Builder.CreateLShr(Load, OffsetDiff * 8);
diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index 1876dc3f71221..f6bdbf5e9be2c 100644
--- a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -301,6 +301,26 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
MCInstLowering.lower(MI, TmpInst);
EmitToStreamer(*OutStreamer, TmpInst);
+#ifdef EXPENSIVE_CHECKS
+ // Sanity-check getInstSizeInBytes on explicitly specified CPUs (it cannot
+ // work correctly for the generic CPU).
+ //
+ // The isPseudo check really shouldn't be here, but unfortunately there are
+ // some negative lit tests that depend on being able to continue through
+ // here even when pseudo instructions haven't been lowered.
+ if (!MI->isPseudo() && STI.isCPUStringValid(STI.getCPU())) {
+ SmallVector<MCFixup, 4> Fixups;
+ SmallVector<char, 16> CodeBytes;
+ raw_svector_ostream CodeStream(CodeBytes);
+
+ std::unique_ptr<MCCodeEmitter> InstEmitter(createSIMCCodeEmitter(
+ *STI.getInstrInfo(), *OutContext.getRegisterInfo(), OutContext));
+ InstEmitter->encodeInstruction(TmpInst, CodeStream, Fixups, STI);
+
+ assert(CodeBytes.size() == STI.getInstrInfo()->getInstSizeInBytes(*MI));
+ }
+#endif
+
if (STI.dumpCode()) {
// Disassemble instruction/operands to text.
DisasmLines.resize(DisasmLines.size() + 1);
diff --git a/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp b/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp
index 995d9ae3907fc..5e0b7d4290220 100644
--- a/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp
@@ -42,9 +42,12 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII_,
if (!FirstMI)
return true;
+ const MachineBasicBlock &MBB = *FirstMI->getParent();
+ const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
const MachineOperand *Src2 = TII.getNamedOperand(SecondMI,
AMDGPU::OpName::src2);
- return FirstMI->definesRegister(Src2->getReg());
+ return FirstMI->definesRegister(Src2->getReg(), TRI);
}
default:
return false;
diff --git a/lib/Target/AMDGPU/AMDGPUPTNote.h b/lib/Target/AMDGPU/AMDGPUPTNote.h
index b50a2eb8e9e71..2feff14d34a15 100644
--- a/lib/Target/AMDGPU/AMDGPUPTNote.h
+++ b/lib/Target/AMDGPU/AMDGPUPTNote.h
@@ -23,7 +23,8 @@ namespace ElfNote {
const char SectionName[] = ".note";
-const char NoteName[] = "AMD";
+const char NoteNameV2[] = "AMD";
+const char NoteNameV3[] = "AMDGPU";
// TODO: Remove this file once we drop code object v2.
enum NoteType{
diff --git a/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
index 3cfdccc9fe51a..e53a8fe7c074d 100644
--- a/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
+++ b/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
@@ -99,8 +99,6 @@ private:
const DataLayout *DL;
- AMDGPUAS AS;
-
const TargetLowering *TLI;
void visit(const Function &F);
@@ -267,7 +265,6 @@ void AMDGPUPerfHint::runOnFunction(Function &F) {
const Module &M = *F.getParent();
DL = &M.getDataLayout();
- AS = AMDGPU::getAMDGPUAS(M);
visit(F);
auto Loc = FIM.find(&F);
@@ -306,14 +303,14 @@ bool AMDGPUPerfHint::isGlobalAddr(const Value *V) const {
if (auto PT = dyn_cast<PointerType>(V->getType())) {
unsigned As = PT->getAddressSpace();
// Flat likely points to global too.
- return As == AS.GLOBAL_ADDRESS || As == AS.FLAT_ADDRESS;
+ return As == AMDGPUAS::GLOBAL_ADDRESS || As == AMDGPUAS::FLAT_ADDRESS;
}
return false;
}
bool AMDGPUPerfHint::isLocalAddr(const Value *V) const {
if (auto PT = dyn_cast<PointerType>(V->getType()))
- return PT->getAddressSpace() == AS.LOCAL_ADDRESS;
+ return PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
return false;
}
@@ -346,7 +343,8 @@ AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const {
bool AMDGPUPerfHint::isConstantAddr(const Value *V) const {
if (auto PT = dyn_cast<PointerType>(V->getType())) {
unsigned As = PT->getAddressSpace();
- return As == AS.CONSTANT_ADDRESS || As == AS.CONSTANT_ADDRESS_32BIT;
+ return As == AMDGPUAS::CONSTANT_ADDRESS ||
+ As == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
}
return false;
}
diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index d341fec6296fb..5d087c0991844 100644
--- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -70,13 +70,17 @@ static cl::opt<bool> DisablePromoteAllocaToVector(
cl::desc("Disable promote alloca to vector"),
cl::init(false));
+static cl::opt<bool> DisablePromoteAllocaToLDS(
+ "disable-promote-alloca-to-lds",
+ cl::desc("Disable promote alloca to LDS"),
+ cl::init(false));
+
// FIXME: This can create globals so should be a module pass.
class AMDGPUPromoteAlloca : public FunctionPass {
private:
const TargetMachine *TM;
Module *Mod = nullptr;
const DataLayout *DL = nullptr;
- AMDGPUAS AS;
// FIXME: This should be per-kernel.
uint32_t LocalMemLimit = 0;
@@ -156,8 +160,6 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
if (!ST.isPromoteAllocaEnabled())
return false;
- AS = AMDGPU::getAMDGPUAS(*F.getParent());
-
bool SufficientLDS = hasSufficientLocalMem(F);
bool Changed = false;
BasicBlock &EntryBB = *F.begin();
@@ -238,7 +240,7 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
Type *I32Ty = Type::getInt32Ty(Mod->getContext());
Value *CastDispatchPtr = Builder.CreateBitCast(
- DispatchPtr, PointerType::get(I32Ty, AS.CONSTANT_ADDRESS));
+ DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS));
// We could do a single 64-bit load here, but it's likely that the basic
// 32-bit and extract sequence is already present, and it is probably easier
@@ -326,6 +328,10 @@ static bool canVectorizeInst(Instruction *Inst, User *User) {
// Currently only handle the case where the Pointer Operand is a GEP.
// Also we could not vectorize volatile or atomic loads.
LoadInst *LI = cast<LoadInst>(Inst);
+ if (isa<AllocaInst>(User) &&
+ LI->getPointerOperandType() == User->getType() &&
+ isa<VectorType>(LI->getType()))
+ return true;
return isa<GetElementPtrInst>(LI->getPointerOperand()) && LI->isSimple();
}
case Instruction::BitCast:
@@ -335,6 +341,10 @@ static bool canVectorizeInst(Instruction *Inst, User *User) {
// since it should be canonical form, the User should be a GEP.
// Also we could not vectorize volatile or atomic stores.
StoreInst *SI = cast<StoreInst>(Inst);
+ if (isa<AllocaInst>(User) &&
+ SI->getPointerOperandType() == User->getType() &&
+ isa<VectorType>(SI->getValueOperand()->getType()))
+ return true;
return (SI->getPointerOperand() == User) && isa<GetElementPtrInst>(User) && SI->isSimple();
}
default:
@@ -342,14 +352,15 @@ static bool canVectorizeInst(Instruction *Inst, User *User) {
}
}
-static bool tryPromoteAllocaToVector(AllocaInst *Alloca, AMDGPUAS AS) {
+static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
if (DisablePromoteAllocaToVector) {
LLVM_DEBUG(dbgs() << " Promotion alloca to vector is disabled\n");
return false;
}
- ArrayType *AllocaTy = dyn_cast<ArrayType>(Alloca->getAllocatedType());
+ Type *AT = Alloca->getAllocatedType();
+ SequentialType *AllocaTy = dyn_cast<SequentialType>(AT);
LLVM_DEBUG(dbgs() << "Alloca candidate for vectorization\n");
@@ -396,7 +407,9 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, AMDGPUAS AS) {
}
}
- VectorType *VectorTy = arrayTypeToVecType(AllocaTy);
+ VectorType *VectorTy = dyn_cast<VectorType>(AllocaTy);
+ if (!VectorTy)
+ VectorTy = arrayTypeToVecType(cast<ArrayType>(AllocaTy));
LLVM_DEBUG(dbgs() << " Converting alloca to vector " << *AllocaTy << " -> "
<< *VectorTy << '\n');
@@ -406,7 +419,10 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, AMDGPUAS AS) {
IRBuilder<> Builder(Inst);
switch (Inst->getOpcode()) {
case Instruction::Load: {
- Type *VecPtrTy = VectorTy->getPointerTo(AS.PRIVATE_ADDRESS);
+ if (Inst->getType() == AT)
+ break;
+
+ Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
Value *Ptr = cast<LoadInst>(Inst)->getPointerOperand();
Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
@@ -418,9 +434,11 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, AMDGPUAS AS) {
break;
}
case Instruction::Store: {
- Type *VecPtrTy = VectorTy->getPointerTo(AS.PRIVATE_ADDRESS);
-
StoreInst *SI = cast<StoreInst>(Inst);
+ if (SI->getValueOperand()->getType() == AT)
+ break;
+
+ Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
Value *Ptr = SI->getPointerOperand();
Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
@@ -610,7 +628,7 @@ bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) {
// we cannot use local memory in the pass.
for (Type *ParamTy : FTy->params()) {
PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
- if (PtrTy && PtrTy->getAddressSpace() == AS.LOCAL_ADDRESS) {
+ if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
LocalMemLimit = 0;
LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to "
"local memory disabled.\n");
@@ -627,7 +645,7 @@ bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) {
// Check how much local memory is being used by global objects
CurrentLocalMemUsage = 0;
for (GlobalVariable &GV : Mod->globals()) {
- if (GV.getType()->getAddressSpace() != AS.LOCAL_ADDRESS)
+ if (GV.getType()->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
continue;
for (const User *U : GV.users()) {
@@ -706,9 +724,12 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');
- if (tryPromoteAllocaToVector(&I, AS))
+ if (tryPromoteAllocaToVector(&I))
return true; // Promoted to vector.
+ if (DisablePromoteAllocaToLDS)
+ return false;
+
const Function &ContainingFunction = *I.getParent()->getParent();
CallingConv::ID CC = ContainingFunction.getCallingConv();
@@ -775,7 +796,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
Twine(F->getName()) + Twine('.') + I.getName(),
nullptr,
GlobalVariable::NotThreadLocal,
- AS.LOCAL_ADDRESS);
+ AMDGPUAS::LOCAL_ADDRESS);
GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
GV->setAlignment(I.getAlignment());
@@ -808,7 +829,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
Value *Src0 = CI->getOperand(0);
Type *EltTy = Src0->getType()->getPointerElementType();
- PointerType *NewTy = PointerType::get(EltTy, AS.LOCAL_ADDRESS);
+ PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
if (isa<ConstantPointerNull>(CI->getOperand(0)))
CI->setOperand(0, ConstantPointerNull::get(NewTy));
@@ -825,7 +846,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
continue;
Type *EltTy = V->getType()->getPointerElementType();
- PointerType *NewTy = PointerType::get(EltTy, AS.LOCAL_ADDRESS);
+ PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
// FIXME: It doesn't really make sense to try to do this for all
// instructions.
@@ -894,7 +915,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
Type *SrcTy = Src->getType()->getPointerElementType();
Function *ObjectSize = Intrinsic::getDeclaration(Mod,
Intrinsic::objectsize,
- { Intr->getType(), PointerType::get(SrcTy, AS.LOCAL_ADDRESS) }
+ { Intr->getType(), PointerType::get(SrcTy, AMDGPUAS::LOCAL_ADDRESS) }
);
CallInst *NewCall = Builder.CreateCall(
diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 012e4fe200aae..7a760dcf7a908 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -35,7 +35,7 @@ AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI)
: AMDGPUGenRegisterBankInfo(),
TRI(static_cast<const SIRegisterInfo*>(&TRI)) {
- // HACK: Until this is fully tablegen'd
+ // HACK: Until this is fully tablegen'd.
static bool AlreadyInit = false;
if (AlreadyInit)
return;
@@ -74,13 +74,16 @@ unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
const RegisterBank &Src,
unsigned Size) const {
if (Dst.getID() == AMDGPU::SGPRRegBankID &&
- Src.getID() == AMDGPU::VGPRRegBankID)
+ Src.getID() == AMDGPU::VGPRRegBankID) {
return std::numeric_limits<unsigned>::max();
+ }
// SGPRRegBank with size 1 is actually vcc or another 64-bit sgpr written by
// the valu.
if (Size == 1 && Dst.getID() == AMDGPU::SCCRegBankID &&
- Src.getID() == AMDGPU::SGPRRegBankID)
+ (Src.getID() == AMDGPU::SGPRRegBankID ||
+ Src.getID() == AMDGPU::VGPRRegBankID ||
+ Src.getID() == AMDGPU::VCCRegBankID))
return std::numeric_limits<unsigned>::max();
return RegisterBankInfo::copyCost(Dst, Src, Size);
@@ -145,7 +148,7 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings(
AltMappings.push_back(&SSMapping);
const InstructionMapping &SVMapping = getInstructionMapping(2, 1,
- getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
+ getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
nullptr, // Predicate operand.
AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
@@ -153,7 +156,7 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings(
AltMappings.push_back(&SVMapping);
const InstructionMapping &VSMapping = getInstructionMapping(3, 1,
- getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
+ getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
nullptr, // Predicate operand.
AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
@@ -161,7 +164,7 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings(
AltMappings.push_back(&VSMapping);
const InstructionMapping &VVMapping = getInstructionMapping(4, 1,
- getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
+ getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
nullptr, // Predicate operand.
AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
@@ -170,6 +173,67 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings(
return AltMappings;
}
+ case TargetOpcode::G_SELECT: {
+ unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
+ const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
+ getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1),
+ AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
+ 4); // Num Operands
+ AltMappings.push_back(&SSMapping);
+
+ const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
+ getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
+ AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
+ 4); // Num Operands
+ AltMappings.push_back(&VVMapping);
+
+ return AltMappings;
+ }
+ case TargetOpcode::G_UADDE:
+ case TargetOpcode::G_USUBE:
+ case TargetOpcode::G_SADDE:
+ case TargetOpcode::G_SSUBE: {
+ unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
+ const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
+ getOperandsMapping(
+ {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1),
+ AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1)}),
+ 5); // Num Operands
+ AltMappings.push_back(&SSMapping);
+
+ const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
+ getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
+ AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
+ 5); // Num Operands
+ AltMappings.push_back(&VVMapping);
+ return AltMappings;
+ }
+ case AMDGPU::G_BRCOND: {
+ assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
+
+ const InstructionMapping &SMapping = getInstructionMapping(
+ 1, 1, getOperandsMapping(
+ {AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1), nullptr}),
+ 2); // Num Operands
+ AltMappings.push_back(&SMapping);
+
+ const InstructionMapping &VMapping = getInstructionMapping(
+ 1, 1, getOperandsMapping(
+ {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
+ 2); // Num Operands
+ AltMappings.push_back(&VMapping);
+ return AltMappings;
+ }
default:
break;
}
@@ -193,10 +257,16 @@ bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
const MachineFunction &MF = *MI.getParent()->getParent();
const MachineRegisterInfo &MRI = MF.getRegInfo();
for (unsigned i = 0, e = MI.getNumOperands();i != e; ++i) {
+ if (!MI.getOperand(i).isReg())
+ continue;
unsigned Reg = MI.getOperand(i).getReg();
- const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
- if (Bank && Bank->getID() != AMDGPU::SGPRRegBankID)
- return false;
+ if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
+ if (Bank->getID() == AMDGPU::VGPRRegBankID)
+ return false;
+
+ assert(Bank->getID() == AMDGPU::SGPRRegBankID ||
+ Bank->getID() == AMDGPU::SCCRegBankID);
+ }
}
return true;
}
@@ -209,7 +279,8 @@ AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
- OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
+ unsigned BankID = Size == 1 ? AMDGPU::SCCRegBankID : AMDGPU::SGPRRegBankID;
+ OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
}
return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
MI.getNumOperands());
@@ -230,12 +301,32 @@ AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
unsigned Reg1 = MI.getOperand(OpdIdx).getReg();
unsigned Size1 = getSizeInBits(Reg1, MRI, *TRI);
- unsigned Bank1 = getRegBankID(Reg1, MRI, *TRI);
+
+ unsigned DefaultBankID = Size1 == 1 ?
+ AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
+ unsigned Bank1 = getRegBankID(Reg1, MRI, *TRI, DefaultBankID);
+
OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(Bank1, Size1);
for (unsigned e = MI.getNumOperands(); OpdIdx != e; ++OpdIdx) {
unsigned Size = getSizeInBits(MI.getOperand(OpdIdx).getReg(), MRI, *TRI);
- OpdsMapping[OpdIdx] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+ unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
+ OpdsMapping[OpdIdx] = AMDGPU::getValueMapping(BankID, Size);
+ }
+
+ return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
+ MI.getNumOperands());
+}
+
+const RegisterBankInfo::InstructionMapping &
+AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
+
+ for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
+ unsigned Size = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
+ OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
}
return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
@@ -304,21 +395,49 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
switch (MI.getOpcode()) {
default:
return getInvalidInstructionMapping();
+
+ case AMDGPU::G_AND:
+ case AMDGPU::G_OR:
+ case AMDGPU::G_XOR: {
+ unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ if (Size == 1) {
+ OpdsMapping[0] = OpdsMapping[1] =
+ OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
+ break;
+ }
+
+ LLVM_FALLTHROUGH;
+ }
+
case AMDGPU::G_ADD:
case AMDGPU::G_SUB:
case AMDGPU::G_MUL:
- case AMDGPU::G_AND:
- case AMDGPU::G_OR:
- case AMDGPU::G_XOR:
case AMDGPU::G_SHL:
+ case AMDGPU::G_UADDO:
+ case AMDGPU::G_SADDO:
+ case AMDGPU::G_USUBO:
+ case AMDGPU::G_SSUBO:
+ case AMDGPU::G_UADDE:
+ case AMDGPU::G_SADDE:
+ case AMDGPU::G_USUBE:
+ case AMDGPU::G_SSUBE:
if (isSALUMapping(MI))
return getDefaultMappingSOP(MI);
- // Fall-through
+ LLVM_FALLTHROUGH;
case AMDGPU::G_FADD:
+ case AMDGPU::G_FSUB:
case AMDGPU::G_FPTOSI:
case AMDGPU::G_FPTOUI:
case AMDGPU::G_FMUL:
+ case AMDGPU::G_FMA:
+ case AMDGPU::G_SITOFP:
+ case AMDGPU::G_UITOFP:
+ case AMDGPU::G_FPTRUNC:
+ case AMDGPU::G_FEXP2:
+ case AMDGPU::G_FLOG2:
+ case AMDGPU::G_INTRINSIC_TRUNC:
+ case AMDGPU::G_INTRINSIC_ROUND:
return getDefaultMappingVOP(MI);
case AMDGPU::G_IMPLICIT_DEF: {
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
@@ -326,11 +445,25 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
break;
}
case AMDGPU::G_FCONSTANT:
- case AMDGPU::G_CONSTANT: {
+ case AMDGPU::G_CONSTANT:
+ case AMDGPU::G_FRAME_INDEX:
+ case AMDGPU::G_BLOCK_ADDR: {
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
break;
}
+ case AMDGPU::G_INSERT: {
+ unsigned BankID = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID :
+ AMDGPU::VGPRRegBankID;
+ unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
+ unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
+ unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
+ OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
+ OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
+ OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
+ OpdsMapping[3] = nullptr;
+ break;
+ }
case AMDGPU::G_EXTRACT: {
unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
@@ -352,7 +485,17 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
break;
}
- case AMDGPU::G_BITCAST: {
+ case AMDGPU::G_BITCAST:
+ case AMDGPU::G_INTTOPTR:
+ case AMDGPU::G_PTRTOINT:
+ case AMDGPU::G_CTLZ:
+ case AMDGPU::G_CTLZ_ZERO_UNDEF:
+ case AMDGPU::G_CTTZ:
+ case AMDGPU::G_CTTZ_ZERO_UNDEF:
+ case AMDGPU::G_CTPOP:
+ case AMDGPU::G_BSWAP:
+ case AMDGPU::G_FABS:
+ case AMDGPU::G_FNEG: {
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
@@ -368,7 +511,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
break;
}
- case AMDGPU::G_ZEXT: {
+ case AMDGPU::G_ZEXT:
+ case AMDGPU::G_SEXT:
+ case AMDGPU::G_ANYEXT: {
unsigned Dst = MI.getOperand(0).getReg();
unsigned Src = MI.getOperand(1).getReg();
unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
@@ -391,7 +536,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_FCMP: {
unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
- OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 1);
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
OpdsMapping[1] = nullptr; // Predicate Operand.
OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
@@ -431,7 +576,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
unsigned Op0Bank = Op2Bank == AMDGPU::SGPRRegBankID &&
Op3Bank == AMDGPU::SGPRRegBankID ?
- AMDGPU::SCCRegBankID : AMDGPU::VGPRRegBankID;
+ AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;
OpdsMapping[0] = AMDGPU::getValueMapping(Op0Bank, 1);
OpdsMapping[1] = nullptr; // Predicate Operand.
OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
@@ -479,6 +624,18 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
break;
}
+ case AMDGPU::G_UNMERGE_VALUES: {
+ unsigned Bank = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID :
+ AMDGPU::VGPRRegBankID;
+
+ // Op1 and Dst should use the same register bank.
+ // FIXME: Shouldn't this be the default? Why do we need to handle this?
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
+ OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
+ }
+ break;
+ }
case AMDGPU::G_INTRINSIC: {
switch (MI.getOperand(1).getIntrinsicID()) {
default:
@@ -492,6 +649,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
break;
}
+ case Intrinsic::amdgcn_wqm_vote: {
+ unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ OpdsMapping[0] = OpdsMapping[2]
+ = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
+ break;
+ }
}
break;
}
@@ -528,8 +691,50 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
}
break;
}
+ case AMDGPU::G_SELECT: {
+ unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ unsigned Op1Bank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
+ AMDGPU::SGPRRegBankID);
+ unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
+ unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
+ bool SGPRSrcs = Op1Bank == AMDGPU::SCCRegBankID &&
+ Op2Bank == AMDGPU::SGPRRegBankID &&
+ Op3Bank == AMDGPU::SGPRRegBankID;
+ unsigned Bank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
+ Op1Bank = SGPRSrcs ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;
+ OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
+ OpdsMapping[1] = AMDGPU::getValueMapping(Op1Bank, 1);
+ OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
+ OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
+ break;
+ }
+
case AMDGPU::G_LOAD:
return getInstrMappingForLoad(MI);
+
+ case AMDGPU::G_ATOMICRMW_XCHG:
+ case AMDGPU::G_ATOMICRMW_ADD:
+ case AMDGPU::G_ATOMICRMW_SUB:
+ case AMDGPU::G_ATOMICRMW_AND:
+ case AMDGPU::G_ATOMICRMW_OR:
+ case AMDGPU::G_ATOMICRMW_XOR:
+ case AMDGPU::G_ATOMICRMW_MAX:
+ case AMDGPU::G_ATOMICRMW_MIN:
+ case AMDGPU::G_ATOMICRMW_UMAX:
+ case AMDGPU::G_ATOMICRMW_UMIN:
+ case AMDGPU::G_ATOMIC_CMPXCHG: {
+ return getDefaultMappingAllVGPR(MI);
+ }
+ case AMDGPU::G_BRCOND: {
+ unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI, *TRI,
+ AMDGPU::SGPRRegBankID);
+ assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
+ if (Bank != AMDGPU::SCCRegBankID)
+ Bank = AMDGPU::VCCRegBankID;
+
+ OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
+ break;
+ }
}
return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
index d48a665898735..d29f4bc79a519 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
+++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
@@ -49,6 +49,8 @@ class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo {
bool isSALUMapping(const MachineInstr &MI) const;
const InstructionMapping &getDefaultMappingSOP(const MachineInstr &MI) const;
const InstructionMapping &getDefaultMappingVOP(const MachineInstr &MI) const;
+ const InstructionMapping &getDefaultMappingAllVGPR(
+ const MachineInstr &MI) const;
public:
AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI);
diff --git a/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/lib/Target/AMDGPU/AMDGPURegisterBanks.td
index 7f7f75f656479..570379a820e12 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterBanks.td
+++ b/lib/Target/AMDGPU/AMDGPURegisterBanks.td
@@ -15,4 +15,7 @@ def VGPRRegBank : RegisterBank<"VGPR",
[VGPR_32, VReg_64, VReg_96, VReg_128, VReg_256, VReg_512]
>;
-def SCCRegBank : RegisterBank <"SCC", [SCC_CLASS ]>;
+def SCCRegBank : RegisterBank <"SCC", [SCC_CLASS]>;
+
+// It is helpful to distinguish conditions from ordinary SGPRs.
+def VCCRegBank : RegisterBank <"VCC", [SReg_64]>;
diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.h b/lib/Target/AMDGPU/AMDGPURegisterInfo.h
index 07de5fc549e29..922d974f2ebd6 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterInfo.h
+++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.h
@@ -27,8 +27,6 @@ class TargetInstrInfo;
struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo {
AMDGPURegisterInfo();
- bool enableMultipleCopyHints() const override { return true; }
-
/// \returns the sub reg enum value for the given \p Channel
/// (e.g. getSubRegFromChannel(0) -> AMDGPU::sub0)
static unsigned getSubRegFromChannel(unsigned Channel);
diff --git a/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp b/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
index a861762a8c9e3..efe501cb73c27 100644
--- a/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
+++ b/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
@@ -163,7 +163,7 @@ bool AMDGPURewriteOutArguments::checkArgumentUses(Value &Arg) const {
// some casts between structs and non-structs, but we can't bitcast
// directly between them. directly bitcast between them. Blender uses
// some casts that look like { <3 x float> }* to <4 x float>*
- if ((SrcEltTy->isStructTy() && (SrcEltTy->getNumContainedTypes() != 1)))
+ if ((SrcEltTy->isStructTy() && (SrcEltTy->getStructNumElements() != 1)))
return false;
// Clang emits OpenCL 3-vector type accesses with a bitcast to the
@@ -401,8 +401,8 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
if (Val->getType() != EltTy) {
Type *EffectiveEltTy = EltTy;
if (StructType *CT = dyn_cast<StructType>(EltTy)) {
- assert(CT->getNumContainedTypes() == 1);
- EffectiveEltTy = CT->getContainedType(0);
+ assert(CT->getNumElements() == 1);
+ EffectiveEltTy = CT->getElementType(0);
}
if (DL->getTypeSizeInBits(EffectiveEltTy) !=
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 98b49070fa99f..ed0cc70c3d9aa 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -74,6 +74,9 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
// We want to be able to turn these off, but making this a subtarget feature
// for SI has the unhelpful behavior that it unsets everything else if you
// disable it.
+ //
+ // Similarly, we want enable-prt-strict-null to be on by default and not to
+ // unset everything else if it is disabled.
SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");
@@ -89,6 +92,8 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
FullFS += "-fp32-denormals,";
}
+ FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
+
FullFS += FS;
ParseSubtargetFeatures(GPU, FullFS);
@@ -124,10 +129,8 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
return *this;
}
-AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT,
- const FeatureBitset &FeatureBits) :
+AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
TargetTriple(TT),
- SubtargetFeatureBits(FeatureBits),
Has16BitInsts(false),
HasMadMixInsts(false),
FP32Denormals(false),
@@ -136,19 +139,22 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT,
HasVOP3PInsts(false),
HasMulI24(true),
HasMulU24(true),
+ HasInv2PiInlineImm(false),
HasFminFmaxLegacy(true),
EnablePromoteAlloca(false),
+ HasTrigReducedRange(false),
LocalMemorySize(0),
WavefrontSize(0)
{ }
GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
- const GCNTargetMachine &TM) :
+ const GCNTargetMachine &TM) :
AMDGPUGenSubtargetInfo(TT, GPU, FS),
- AMDGPUSubtarget(TT, getFeatureBits()),
+ AMDGPUSubtarget(TT),
TargetTriple(TT),
Gen(SOUTHERN_ISLANDS),
IsaVersion(ISAVersion0_0_0),
+ InstrItins(getInstrItineraryForCPU(GPU)),
LDSBankCount(0),
MaxPrivateElementSize(0),
@@ -170,16 +176,17 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
DebuggerEmitPrologue(false),
EnableHugePrivateBuffer(false),
- EnableVGPRSpilling(false),
EnableLoadStoreOpt(false),
EnableUnsafeDSOffsetFolding(false),
EnableSIScheduler(false),
EnableDS128(false),
+ EnablePRTStrictNull(false),
DumpCode(false),
FP64(false),
GCN3Encoding(false),
CIInsts(false),
+ VIInsts(false),
GFX9Insts(false),
SGPRInitBug(false),
HasSMemRealTime(false),
@@ -189,15 +196,16 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
HasVGPRIndexMode(false),
HasScalarStores(false),
HasScalarAtomics(false),
- HasInv2PiInlineImm(false),
HasSDWAOmod(false),
HasSDWAScalar(false),
HasSDWASdst(false),
HasSDWAMac(false),
HasSDWAOutModsVOPC(false),
HasDPP(false),
+ HasR128A16(false),
HasDLInsts(false),
- D16PreservesUnusedBits(false),
+ HasDotInsts(false),
+ EnableSRAMECC(false),
FlatAddressSpace(false),
FlatInstOffsets(false),
FlatGlobalInsts(false),
@@ -211,7 +219,6 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
TLInfo(TM, *this),
FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
- AS = AMDGPU::getAMDGPUAS(TT);
CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
@@ -447,7 +454,7 @@ unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
const TargetMachine &TM) :
R600GenSubtargetInfo(TT, GPU, FS),
- AMDGPUSubtarget(TT, getFeatureBits()),
+ AMDGPUSubtarget(TT),
InstrInfo(*this),
FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
FMA(false),
@@ -460,8 +467,7 @@ R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
TexVTXClauseSize(0),
Gen(R600),
TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
- InstrItins(getInstrItineraryForCPU(GPU)),
- AS (AMDGPU::getAMDGPUAS(TT)) { }
+ InstrItins(getInstrItineraryForCPU(GPU)) { }
void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
unsigned NumRegionInstrs) const {
@@ -480,10 +486,6 @@ void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
Policy.ShouldTrackLaneMasks = true;
}
-bool GCNSubtarget::isVGPRSpillingEnabled(const Function& F) const {
- return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
-}
-
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
if (SGPRs <= 80)
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 6231097336518..5584759e55804 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -63,7 +63,6 @@ private:
Triple TargetTriple;
protected:
- const FeatureBitset &SubtargetFeatureBits;
bool Has16BitInsts;
bool HasMadMixInsts;
bool FP32Denormals;
@@ -72,13 +71,15 @@ protected:
bool HasVOP3PInsts;
bool HasMulI24;
bool HasMulU24;
+ bool HasInv2PiInlineImm;
bool HasFminFmaxLegacy;
bool EnablePromoteAlloca;
+ bool HasTrigReducedRange;
int LocalMemorySize;
unsigned WavefrontSize;
public:
- AMDGPUSubtarget(const Triple &TT, const FeatureBitset &FeatureBits);
+ AMDGPUSubtarget(const Triple &TT);
static const AMDGPUSubtarget &get(const MachineFunction &MF);
static const AMDGPUSubtarget &get(const TargetMachine &TM,
@@ -134,7 +135,7 @@ public:
return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}
- bool isAmdCodeObjectV2(const Function &F) const {
+ bool isAmdHsaOrMesa(const Function &F) const {
return isAmdHsaOS() || isMesaKernel(F);
}
@@ -170,10 +171,18 @@ public:
return HasMulU24;
}
+ bool hasInv2PiInlineImm() const {
+ return HasInv2PiInlineImm;
+ }
+
bool hasFminFmaxLegacy() const {
return HasFminFmaxLegacy;
}
+ bool hasTrigReducedRange() const {
+ return HasTrigReducedRange;
+ }
+
bool isPromoteAllocaEnabled() const {
return EnablePromoteAlloca;
}
@@ -193,38 +202,26 @@ public:
/// Returns the offset in bytes from the start of the input buffer
/// of the first explicit kernel argument.
unsigned getExplicitKernelArgOffset(const Function &F) const {
- return isAmdCodeObjectV2(F) ? 0 : 36;
+ return isAmdHsaOrMesa(F) ? 0 : 36;
}
/// \returns Maximum number of work groups per compute unit supported by the
/// subtarget and limited by given \p FlatWorkGroupSize.
- unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const {
- return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(SubtargetFeatureBits,
- FlatWorkGroupSize);
- }
+ virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const = 0;
/// \returns Minimum flat work group size supported by the subtarget.
- unsigned getMinFlatWorkGroupSize() const {
- return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(SubtargetFeatureBits);
- }
+ virtual unsigned getMinFlatWorkGroupSize() const = 0;
/// \returns Maximum flat work group size supported by the subtarget.
- unsigned getMaxFlatWorkGroupSize() const {
- return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(SubtargetFeatureBits);
- }
+ virtual unsigned getMaxFlatWorkGroupSize() const = 0;
/// \returns Maximum number of waves per execution unit supported by the
/// subtarget and limited by given \p FlatWorkGroupSize.
- unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const {
- return AMDGPU::IsaInfo::getMaxWavesPerEU(SubtargetFeatureBits,
- FlatWorkGroupSize);
- }
+ virtual unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const = 0;
/// \returns Minimum number of waves per execution unit supported by the
/// subtarget.
- unsigned getMinWavesPerEU() const {
- return AMDGPU::IsaInfo::getMinWavesPerEU(SubtargetFeatureBits);
- }
+ virtual unsigned getMinWavesPerEU() const = 0;
unsigned getMaxWavesPerEU() const { return 10; }
@@ -266,6 +263,7 @@ public:
ISAVersion9_0_2,
ISAVersion9_0_4,
ISAVersion9_0_6,
+ ISAVersion9_0_9,
};
enum TrapHandlerAbi {
@@ -300,6 +298,7 @@ protected:
Triple TargetTriple;
unsigned Gen;
unsigned IsaVersion;
+ InstrItineraryData InstrItins;
int LDSBankCount;
unsigned MaxPrivateElementSize;
@@ -323,11 +322,11 @@ protected:
// Used as options.
bool EnableHugePrivateBuffer;
- bool EnableVGPRSpilling;
bool EnableLoadStoreOpt;
bool EnableUnsafeDSOffsetFolding;
bool EnableSIScheduler;
bool EnableDS128;
+ bool EnablePRTStrictNull;
bool DumpCode;
// Subtarget statically properties set by tablegen
@@ -337,6 +336,7 @@ protected:
bool IsGCN;
bool GCN3Encoding;
bool CIInsts;
+ bool VIInsts;
bool GFX9Insts;
bool SGPRInitBug;
bool HasSMemRealTime;
@@ -346,15 +346,16 @@ protected:
bool HasVGPRIndexMode;
bool HasScalarStores;
bool HasScalarAtomics;
- bool HasInv2PiInlineImm;
bool HasSDWAOmod;
bool HasSDWAScalar;
bool HasSDWASdst;
bool HasSDWAMac;
bool HasSDWAOutModsVOPC;
bool HasDPP;
+ bool HasR128A16;
bool HasDLInsts;
- bool D16PreservesUnusedBits;
+ bool HasDotInsts;
+ bool EnableSRAMECC;
bool FlatAddressSpace;
bool FlatInstOffsets;
bool FlatGlobalInsts;
@@ -372,7 +373,6 @@ protected:
bool FeatureDisable;
SelectionDAGTargetInfo TSInfo;
- AMDGPUAS AS;
private:
SIInstrInfo InstrInfo;
SITargetLowering TLInfo;
@@ -423,6 +423,10 @@ public:
return &TSInfo;
}
+ const InstrItineraryData *getInstrItineraryData() const override {
+ return &InstrItins;
+ }
+
void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
Generation getGeneration() const {
@@ -441,10 +445,6 @@ public:
return MaxPrivateElementSize;
}
- AMDGPUAS getAMDGPUAS() const {
- return AS;
- }
-
bool hasIntClamp() const {
return HasIntClamp;
}
@@ -517,6 +517,10 @@ public:
return FMA;
}
+ bool hasSwap() const {
+ return GFX9Insts;
+ }
+
TrapHandlerAbi getTrapHandlerAbi() const {
return isAmdHsaOS() ? TrapHandlerAbiHsa : TrapHandlerAbiNone;
}
@@ -574,12 +578,19 @@ public:
return getGeneration() < AMDGPUSubtarget::GFX9;
}
+ /// \returns true if the target requires PRT Strict NULL support (zero result
+ /// registers for sparse texture support).
+ bool usePRTStrictNull() const {
+ return EnablePRTStrictNull;
+ }
+
bool hasAutoWaitcntBeforeBarrier() const {
return AutoWaitcntBeforeBarrier;
}
bool hasCodeObjectV3() const {
- return CodeObjectV3;
+ // FIXME: Need to add code object v3 support for mesa and pal.
+ return isAmdHsaOS() ? CodeObjectV3 : false;
}
bool hasUnalignedBufferAccess() const {
@@ -677,8 +688,12 @@ public:
return HasDLInsts;
}
- bool d16PreservesUnusedBits() const {
- return D16PreservesUnusedBits;
+ bool hasDotInsts() const {
+ return HasDotInsts;
+ }
+
+ bool isSRAMECCEnabled() const {
+ return EnableSRAMECC;
}
// Scratch is allocated in 256 dword per wave blocks for the entire
@@ -707,20 +722,19 @@ public:
/// \returns Number of execution units per compute unit supported by the
/// subtarget.
unsigned getEUsPerCU() const {
- return AMDGPU::IsaInfo::getEUsPerCU(MCSubtargetInfo::getFeatureBits());
+ return AMDGPU::IsaInfo::getEUsPerCU(this);
}
/// \returns Maximum number of waves per compute unit supported by the
/// subtarget without any kind of limitation.
unsigned getMaxWavesPerCU() const {
- return AMDGPU::IsaInfo::getMaxWavesPerCU(MCSubtargetInfo::getFeatureBits());
+ return AMDGPU::IsaInfo::getMaxWavesPerCU(this);
}
/// \returns Maximum number of waves per compute unit supported by the
/// subtarget and limited by given \p FlatWorkGroupSize.
unsigned getMaxWavesPerCU(unsigned FlatWorkGroupSize) const {
- return AMDGPU::IsaInfo::getMaxWavesPerCU(MCSubtargetInfo::getFeatureBits(),
- FlatWorkGroupSize);
+ return AMDGPU::IsaInfo::getMaxWavesPerCU(this, FlatWorkGroupSize);
}
/// \returns Maximum number of waves per execution unit supported by the
@@ -732,8 +746,7 @@ public:
/// \returns Number of waves per work group supported by the subtarget and
/// limited by given \p FlatWorkGroupSize.
unsigned getWavesPerWorkGroup(unsigned FlatWorkGroupSize) const {
- return AMDGPU::IsaInfo::getWavesPerWorkGroup(
- MCSubtargetInfo::getFeatureBits(), FlatWorkGroupSize);
+ return AMDGPU::IsaInfo::getWavesPerWorkGroup(this, FlatWorkGroupSize);
}
// static wrappers
@@ -747,8 +760,6 @@ public:
void overrideSchedPolicy(MachineSchedPolicy &Policy,
unsigned NumRegionInstrs) const override;
- bool isVGPRSpillingEnabled(const Function &F) const;
-
unsigned getMaxNumUserSGPRs() const {
return 16;
}
@@ -781,14 +792,15 @@ public:
return HasScalarAtomics;
}
- bool hasInv2PiInlineImm() const {
- return HasInv2PiInlineImm;
- }
bool hasDPP() const {
return HasDPP;
}
+ bool hasR128A16() const {
+ return HasR128A16;
+ }
+
bool enableSIScheduler() const {
return EnableSIScheduler;
}
@@ -817,6 +829,11 @@ public:
return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
}
+ /// \returns true if the subtarget supports DWORDX3 load/store instructions.
+ bool hasDwordx3LoadStores() const {
+ return CIInsts;
+ }
+
bool hasSMovFedHazard() const {
return getGeneration() >= AMDGPUSubtarget::GFX9;
}
@@ -851,39 +868,34 @@ public:
/// \returns SGPR allocation granularity supported by the subtarget.
unsigned getSGPRAllocGranule() const {
- return AMDGPU::IsaInfo::getSGPRAllocGranule(
- MCSubtargetInfo::getFeatureBits());
+ return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
}
/// \returns SGPR encoding granularity supported by the subtarget.
unsigned getSGPREncodingGranule() const {
- return AMDGPU::IsaInfo::getSGPREncodingGranule(
- MCSubtargetInfo::getFeatureBits());
+ return AMDGPU::IsaInfo::getSGPREncodingGranule(this);
}
/// \returns Total number of SGPRs supported by the subtarget.
unsigned getTotalNumSGPRs() const {
- return AMDGPU::IsaInfo::getTotalNumSGPRs(MCSubtargetInfo::getFeatureBits());
+ return AMDGPU::IsaInfo::getTotalNumSGPRs(this);
}
/// \returns Addressable number of SGPRs supported by the subtarget.
unsigned getAddressableNumSGPRs() const {
- return AMDGPU::IsaInfo::getAddressableNumSGPRs(
- MCSubtargetInfo::getFeatureBits());
+ return AMDGPU::IsaInfo::getAddressableNumSGPRs(this);
}
/// \returns Minimum number of SGPRs that meets the given number of waves per
/// execution unit requirement supported by the subtarget.
unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
- return AMDGPU::IsaInfo::getMinNumSGPRs(MCSubtargetInfo::getFeatureBits(),
- WavesPerEU);
+ return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU);
}
/// \returns Maximum number of SGPRs that meets the given number of waves per
/// execution unit requirement supported by the subtarget.
unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
- return AMDGPU::IsaInfo::getMaxNumSGPRs(MCSubtargetInfo::getFeatureBits(),
- WavesPerEU, Addressable);
+ return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable);
}
/// \returns Reserved number of SGPRs for given function \p MF.
@@ -901,39 +913,34 @@ public:
/// \returns VGPR allocation granularity supported by the subtarget.
unsigned getVGPRAllocGranule() const {
- return AMDGPU::IsaInfo::getVGPRAllocGranule(
- MCSubtargetInfo::getFeatureBits());
+ return AMDGPU::IsaInfo::getVGPRAllocGranule(this);
}
/// \returns VGPR encoding granularity supported by the subtarget.
unsigned getVGPREncodingGranule() const {
- return AMDGPU::IsaInfo::getVGPREncodingGranule(
- MCSubtargetInfo::getFeatureBits());
+ return AMDGPU::IsaInfo::getVGPREncodingGranule(this);
}
/// \returns Total number of VGPRs supported by the subtarget.
unsigned getTotalNumVGPRs() const {
- return AMDGPU::IsaInfo::getTotalNumVGPRs(MCSubtargetInfo::getFeatureBits());
+ return AMDGPU::IsaInfo::getTotalNumVGPRs(this);
}
/// \returns Addressable number of VGPRs supported by the subtarget.
unsigned getAddressableNumVGPRs() const {
- return AMDGPU::IsaInfo::getAddressableNumVGPRs(
- MCSubtargetInfo::getFeatureBits());
+ return AMDGPU::IsaInfo::getAddressableNumVGPRs(this);
}
/// \returns Minimum number of VGPRs that meets given number of waves per
/// execution unit requirement supported by the subtarget.
unsigned getMinNumVGPRs(unsigned WavesPerEU) const {
- return AMDGPU::IsaInfo::getMinNumVGPRs(MCSubtargetInfo::getFeatureBits(),
- WavesPerEU);
+ return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU);
}
/// \returns Maximum number of VGPRs that meets given number of waves per
/// execution unit requirement supported by the subtarget.
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
- return AMDGPU::IsaInfo::getMaxNumVGPRs(MCSubtargetInfo::getFeatureBits(),
- WavesPerEU);
+ return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
}
/// \returns Maximum number of VGPRs that meets number of waves per execution
@@ -949,6 +956,34 @@ public:
void getPostRAMutations(
std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
const override;
+
+ /// \returns Maximum number of work groups per compute unit supported by the
+ /// subtarget and limited by given \p FlatWorkGroupSize.
+ unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
+ return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
+ }
+
+ /// \returns Minimum flat work group size supported by the subtarget.
+ unsigned getMinFlatWorkGroupSize() const override {
+ return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this);
+ }
+
+ /// \returns Maximum flat work group size supported by the subtarget.
+ unsigned getMaxFlatWorkGroupSize() const override {
+ return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
+ }
+
+ /// \returns Maximum number of waves per execution unit supported by the
+ /// subtarget and limited by given \p FlatWorkGroupSize.
+ unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const override {
+ return AMDGPU::IsaInfo::getMaxWavesPerEU(this, FlatWorkGroupSize);
+ }
+
+ /// \returns Minimum number of waves per execution unit supported by the
+ /// subtarget.
+ unsigned getMinWavesPerEU() const override {
+ return AMDGPU::IsaInfo::getMinWavesPerEU(this);
+ }
};
class R600Subtarget final : public R600GenSubtargetInfo,
@@ -968,7 +1003,6 @@ private:
R600TargetLowering TLInfo;
InstrItineraryData InstrItins;
SelectionDAGTargetInfo TSInfo;
- AMDGPUAS AS;
public:
R600Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
@@ -1053,8 +1087,6 @@ public:
short getTexVTXClauseSize() const { return TexVTXClauseSize; }
- AMDGPUAS getAMDGPUAS() const { return AS; }
-
bool enableMachineScheduler() const override {
return true;
}
@@ -1062,6 +1094,34 @@ public:
bool enableSubRegLiveness() const override {
return true;
}
+
+ /// \returns Maximum number of work groups per compute unit supported by the
+ /// subtarget and limited by given \p FlatWorkGroupSize.
+ unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
+ return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
+ }
+
+ /// \returns Minimum flat work group size supported by the subtarget.
+ unsigned getMinFlatWorkGroupSize() const override {
+ return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this);
+ }
+
+ /// \returns Maximum flat work group size supported by the subtarget.
+ unsigned getMaxFlatWorkGroupSize() const override {
+ return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
+ }
+
+ /// \returns Maximum number of waves per execution unit supported by the
+ /// subtarget and limited by given \p FlatWorkGroupSize.
+ unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const override {
+ return AMDGPU::IsaInfo::getMaxWavesPerEU(this, FlatWorkGroupSize);
+ }
+
+ /// \returns Minimum number of waves per execution unit supported by the
+ /// subtarget.
+ unsigned getMinWavesPerEU() const override {
+ return AMDGPU::IsaInfo::getMinWavesPerEU(this);
+ }
};
} // end namespace llvm
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 2205819c444ff..e8cefdbf74b97 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -45,6 +45,7 @@
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Vectorize.h"
#include <memory>
@@ -105,6 +106,11 @@ static cl::opt<bool> EnableSDWAPeephole(
cl::desc("Enable SDWA peepholer"),
cl::init(true));
+static cl::opt<bool> EnableDPPCombine(
+ "amdgpu-dpp-combine",
+ cl::desc("Enable DPP combiner"),
+ cl::init(false));
+
// Enable address space based alias analysis
static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
cl::desc("Enable AMDGPU Alias Analysis"),
@@ -137,6 +143,20 @@ static cl::opt<bool> EnableLowerKernelArguments(
cl::init(true),
cl::Hidden);
+// Enable atomic optimization
+static cl::opt<bool> EnableAtomicOptimizations(
+ "amdgpu-atomic-optimizations",
+ cl::desc("Enable atomic optimizations"),
+ cl::init(false),
+ cl::Hidden);
+
+// Enable Mode register optimization
+static cl::opt<bool> EnableSIModeRegisterPass(
+ "amdgpu-mode-register",
+ cl::desc("Enable mode register pass"),
+ cl::init(true),
+ cl::Hidden);
+
extern "C" void LLVMInitializeAMDGPUTarget() {
// Register the target
RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -150,18 +170,22 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeR600VectorRegMergerPass(*PR);
initializeGlobalISel(*PR);
initializeAMDGPUDAGToDAGISelPass(*PR);
+ initializeGCNDPPCombinePass(*PR);
initializeSILowerI1CopiesPass(*PR);
initializeSIFixSGPRCopiesPass(*PR);
initializeSIFixVGPRCopiesPass(*PR);
+ initializeSIFixupVectorISelPass(*PR);
initializeSIFoldOperandsPass(*PR);
initializeSIPeepholeSDWAPass(*PR);
initializeSIShrinkInstructionsPass(*PR);
initializeSIOptimizeExecMaskingPreRAPass(*PR);
initializeSILoadStoreOptimizerPass(*PR);
+ initializeAMDGPUFixFunctionBitcastsPass(*PR);
initializeAMDGPUAlwaysInlinePass(*PR);
initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
initializeAMDGPUAnnotateUniformValuesPass(*PR);
initializeAMDGPUArgumentUsageInfoPass(*PR);
+ initializeAMDGPUAtomicOptimizerPass(*PR);
initializeAMDGPULowerKernelArgumentsPass(*PR);
initializeAMDGPULowerKernelAttributesPass(*PR);
initializeAMDGPULowerIntrinsicsPass(*PR);
@@ -172,6 +196,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUUnifyMetadataPass(*PR);
initializeSIAnnotateControlFlowPass(*PR);
initializeSIInsertWaitcntsPass(*PR);
+ initializeSIModeRegisterPass(*PR);
initializeSIWholeQuadModePass(*PR);
initializeSILowerControlFlowPass(*PR);
initializeSIInsertSkipsPass(*PR);
@@ -182,6 +207,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeSIFormMemoryClausesPass(*PR);
initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
initializeAMDGPUAAWrapperPassPass(*PR);
+ initializeAMDGPUExternalAAWrapperPass(*PR);
initializeAMDGPUUseNativeCallsPass(*PR);
initializeAMDGPUSimplifyLibCallsPass(*PR);
initializeAMDGPUInlinerPass(*PR);
@@ -292,12 +318,6 @@ static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
return Reloc::PIC_;
}
-static CodeModel::Model getEffectiveCodeModel(Optional<CodeModel::Model> CM) {
- if (CM)
- return *CM;
- return CodeModel::Small;
-}
-
AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
TargetOptions Options,
@@ -306,9 +326,8 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
CodeGenOpt::Level OptLevel)
: LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
FS, Options, getEffectiveRelocModel(RM),
- getEffectiveCodeModel(CM), OptLevel),
+ getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
TLOF(createTLOF(getTargetTriple())) {
- AS = AMDGPU::getAMDGPUAS(TT);
initAsmInfo();
}
@@ -331,13 +350,6 @@ StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
FSAttr.getValueAsString();
}
-static ImmutablePass *createAMDGPUExternalAAWrapperPass() {
- return createExternalAAWrapperPass([](Pass &P, Function &, AAResults &AAR) {
- if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
- AAR.addAAResult(WrapperPass->getResult());
- });
-}
-
/// Predicate for Internalize pass.
static bool mustPreserveGV(const GlobalValue &GV) {
if (const Function *F = dyn_cast<Function>(&GV))
@@ -360,17 +372,6 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
Builder.Inliner = createAMDGPUFunctionInliningPass();
}
- if (Internalize) {
- // If we're generating code, we always have the whole program available. The
- // relocations expected for externally visible functions aren't supported,
- // so make sure every non-entry function is hidden.
- Builder.addExtension(
- PassManagerBuilder::EP_EnabledOnOptLevel0,
- [](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
- PM.add(createInternalizePass(mustPreserveGV));
- });
- }
-
Builder.addExtension(
PassManagerBuilder::EP_ModuleOptimizerEarly,
[Internalize, EarlyInline, AMDGPUAA](const PassManagerBuilder &,
@@ -613,20 +614,23 @@ void AMDGPUPassConfig::addIRPasses() {
disablePass(&FuncletLayoutID);
disablePass(&PatchableFunctionID);
+ addPass(createAtomicExpandPass());
+
+ // This must occur before inlining, as the inliner will not look through
+ // bitcast calls.
+ addPass(createAMDGPUFixFunctionBitcastsPass());
+
addPass(createAMDGPULowerIntrinsicsPass());
- if (TM.getTargetTriple().getArch() == Triple::r600 ||
- !EnableAMDGPUFunctionCalls) {
- // Function calls are not supported, so make sure we inline everything.
- addPass(createAMDGPUAlwaysInlinePass());
- addPass(createAlwaysInlinerLegacyPass());
- // We need to add the barrier noop pass, otherwise adding the function
- // inlining pass will cause all of the PassConfigs passes to be run
- // one function at a time, which means if we have a nodule with two
- // functions, then we will generate code for the first function
- // without ever running any passes on the second.
- addPass(createBarrierNoopPass());
- }
+ // Function calls are not supported, so make sure we inline everything.
+ addPass(createAMDGPUAlwaysInlinePass());
+ addPass(createAlwaysInlinerLegacyPass());
+ // We need to add the barrier noop pass, otherwise adding the function
+ // inlining pass will cause all of the PassConfigs passes to be run
+ // one function at a time, which means if we have a nodule with two
+ // functions, then we will generate code for the first function
+ // without ever running any passes on the second.
+ addPass(createBarrierNoopPass());
if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
// TODO: May want to move later or split into an early and late one.
@@ -690,6 +694,7 @@ void AMDGPUPassConfig::addCodeGenPrepare() {
}
bool AMDGPUPassConfig::addPreISel() {
+ addPass(createLowerSwitchPass());
addPass(createFlattenCFGPass());
return false;
}
@@ -759,6 +764,10 @@ ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
bool GCNPassConfig::addPreISel() {
AMDGPUPassConfig::addPreISel();
+ if (EnableAtomicOptimizations) {
+ addPass(createAMDGPUAtomicOptimizerPass());
+ }
+
// FIXME: We need to run a pass to propagate the attributes when calls are
// supported.
addPass(createAMDGPUAnnotateKernelFeaturesPass());
@@ -789,6 +798,8 @@ void GCNPassConfig::addMachineSSAOptimization() {
//
// XXX - Can we get away without running DeadMachineInstructionElim again?
addPass(&SIFoldOperandsID);
+ if (EnableDPPCombine)
+ addPass(&GCNDPPCombineID);
addPass(&DeadMachineInstructionElimID);
addPass(&SILoadStoreOptimizerID);
if (EnableSDWAPeephole) {
@@ -811,8 +822,10 @@ bool GCNPassConfig::addILPOpts() {
bool GCNPassConfig::addInstSelector() {
AMDGPUPassConfig::addInstSelector();
- addPass(createSILowerI1CopiesPass());
addPass(&SIFixSGPRCopiesID);
+ addPass(createSILowerI1CopiesPass());
+ addPass(createSIFixupVectorISelPass());
+ addPass(createSIAddIMGInitPass());
return false;
}
@@ -878,7 +891,8 @@ void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
void GCNPassConfig::addPostRegAlloc() {
addPass(&SIFixVGPRCopiesID);
- addPass(&SIOptimizeExecMaskingID);
+ if (getOptLevel() > CodeGenOpt::None)
+ addPass(&SIOptimizeExecMaskingID);
TargetPassConfig::addPostRegAlloc();
}
@@ -889,6 +903,7 @@ void GCNPassConfig::addPreEmitPass() {
addPass(createSIMemoryLegalizerPass());
addPass(createSIInsertWaitcntsPass());
addPass(createSIShrinkInstructionsPass());
+ addPass(createSIModeRegisterPass());
// The hazard recognizer that runs as part of the post-ra scheduler does not
// guarantee to be able handle all hazards correctly. This is because if there
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index 0fe14493fabdd..62fbe71d19023 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -34,7 +34,6 @@ namespace llvm {
class AMDGPUTargetMachine : public LLVMTargetMachine {
protected:
std::unique_ptr<TargetLoweringObjectFile> TLOF;
- AMDGPUAS AS;
StringRef getGPUName(const Function &F) const;
StringRef getFeatureString(const Function &F) const;
@@ -55,16 +54,13 @@ public:
TargetLoweringObjectFile *getObjFileLowering() const override {
return TLOF.get();
}
- AMDGPUAS getAMDGPUAS() const {
- return AS;
- }
void adjustPassManager(PassManagerBuilder &) override;
+
/// Get the integer value of a null pointer in the given address space.
uint64_t getNullPointerValue(unsigned AddrSpace) const {
- if (AddrSpace == AS.LOCAL_ADDRESS || AddrSpace == AS.REGION_ADDRESS)
- return -1;
- return 0;
+ return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
+ AddrSpace == AMDGPUAS::REGION_ADDRESS) ? -1 : 0;
}
};
diff --git a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
index e2f718bd3c34d..c4e1efde130b8 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
@@ -29,3 +29,13 @@ MCSection *AMDGPUTargetObjectFile::SelectSectionForGlobal(
return TargetLoweringObjectFileELF::SelectSectionForGlobal(GO, Kind, TM);
}
+
+MCSection *AMDGPUTargetObjectFile::getExplicitSectionGlobal(
+ const GlobalObject *GO, SectionKind SK, const TargetMachine &TM) const {
+ // Set metadata access for the explicit section
+ StringRef SectionName = GO->getSection();
+ if (SectionName.startswith(".AMDGPU.comment."))
+ SK = SectionKind::getMetadata();
+
+ return TargetLoweringObjectFileELF::getExplicitSectionGlobal(GO, SK, TM);
+}
diff --git a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
index dd9dc1a88fc2b..a4ae1a2c18c26 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
@@ -26,6 +26,8 @@ class AMDGPUTargetObjectFile : public TargetLoweringObjectFileELF {
public:
MCSection *SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind,
const TargetMachine &TM) const override;
+ MCSection *getExplicitSectionGlobal(const GlobalObject *GO, SectionKind Kind,
+ const TargetMachine &TM) const override;
};
} // end namespace llvm
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index a68b8d03f06e2..11e4ba4b5010d 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -102,7 +102,6 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
unsigned ThresholdPrivate = UnrollThresholdPrivate;
unsigned ThresholdLocal = UnrollThresholdLocal;
unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
- const AMDGPUAS &ASST = AMDGPU::getAMDGPUAS(TargetTriple);
for (const BasicBlock *BB : L->getBlocks()) {
const DataLayout &DL = BB->getModule()->getDataLayout();
unsigned LocalGEPsSeen = 0;
@@ -140,9 +139,9 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
unsigned AS = GEP->getAddressSpace();
unsigned Threshold = 0;
- if (AS == ASST.PRIVATE_ADDRESS)
+ if (AS == AMDGPUAS::PRIVATE_ADDRESS)
Threshold = ThresholdPrivate;
- else if (AS == ASST.LOCAL_ADDRESS)
+ else if (AS == AMDGPUAS::LOCAL_ADDRESS)
Threshold = ThresholdLocal;
else
continue;
@@ -150,7 +149,7 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
if (UP.Threshold >= Threshold)
continue;
- if (AS == ASST.PRIVATE_ADDRESS) {
+ if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
const Value *Ptr = GEP->getPointerOperand();
const AllocaInst *Alloca =
dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL));
@@ -160,7 +159,7 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
if (AllocaSize > MaxAlloca)
continue;
- } else if (AS == ASST.LOCAL_ADDRESS) {
+ } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
LocalGEPsSeen++;
// Inhibit unroll for local memory if we have seen addressing not to
// a variable, most likely we will be unable to combine it.
@@ -253,19 +252,18 @@ unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
}
unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
- AMDGPUAS AS = ST->getAMDGPUAS();
- if (AddrSpace == AS.GLOBAL_ADDRESS ||
- AddrSpace == AS.CONSTANT_ADDRESS ||
- AddrSpace == AS.CONSTANT_ADDRESS_32BIT) {
+ if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
+ AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
+ AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
return 512;
}
- if (AddrSpace == AS.FLAT_ADDRESS ||
- AddrSpace == AS.LOCAL_ADDRESS ||
- AddrSpace == AS.REGION_ADDRESS)
+ if (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
+ AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
+ AddrSpace == AMDGPUAS::REGION_ADDRESS)
return 128;
- if (AddrSpace == AS.PRIVATE_ADDRESS)
+ if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
return 8 * ST->getMaxPrivateElementSize();
llvm_unreachable("unhandled address space");
@@ -277,7 +275,7 @@ bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
// We allow vectorization of flat stores, even though we may need to decompose
// them later if they may access private memory. We don't have enough context
// here, and legalization can handle it.
- if (AddrSpace == ST->getAMDGPUAS().PRIVATE_ADDRESS) {
+ if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
ChainSizeInBytes <= ST->getMaxPrivateElementSize();
}
@@ -545,14 +543,15 @@ bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
if (const Argument *A = dyn_cast<Argument>(V))
return !isArgPassedInSGPR(A);
- // Loads from the private address space are divergent, because threads
- // can execute the load instruction with the same inputs and get different
- // results.
+ // Loads from the private and flat address spaces are divergent, because
+ // threads can execute the load instruction with the same inputs and get
+ // different results.
//
// All other loads are not divergent, because if threads issue loads with the
// same arguments, they will always get the same result.
if (const LoadInst *Load = dyn_cast<LoadInst>(V))
- return Load->getPointerAddressSpace() == ST->getAMDGPUAS().PRIVATE_ADDRESS;
+ return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
+ Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
// Atomics are divergent because they are executed sequentially: when an
// atomic operation refers to the same address in each thread, then each
@@ -642,20 +641,19 @@ unsigned R600TTIImpl::getMinVectorRegisterBitWidth() const {
}
unsigned R600TTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
- AMDGPUAS AS = ST->getAMDGPUAS();
- if (AddrSpace == AS.GLOBAL_ADDRESS ||
- AddrSpace == AS.CONSTANT_ADDRESS)
+ if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
+ AddrSpace == AMDGPUAS::CONSTANT_ADDRESS)
return 128;
- if (AddrSpace == AS.LOCAL_ADDRESS ||
- AddrSpace == AS.REGION_ADDRESS)
+ if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
+ AddrSpace == AMDGPUAS::REGION_ADDRESS)
return 64;
- if (AddrSpace == AS.PRIVATE_ADDRESS)
+ if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
return 32;
- if ((AddrSpace == AS.PARAM_D_ADDRESS ||
- AddrSpace == AS.PARAM_I_ADDRESS ||
- (AddrSpace >= AS.CONSTANT_BUFFER_0 &&
- AddrSpace <= AS.CONSTANT_BUFFER_15)))
+ if ((AddrSpace == AMDGPUAS::PARAM_D_ADDRESS ||
+ AddrSpace == AMDGPUAS::PARAM_I_ADDRESS ||
+ (AddrSpace >= AMDGPUAS::CONSTANT_BUFFER_0 &&
+ AddrSpace <= AMDGPUAS::CONSTANT_BUFFER_15)))
return 128;
llvm_unreachable("unhandled address space");
}
@@ -666,9 +664,7 @@ bool R600TTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
// We allow vectorization of flat stores, even though we may need to decompose
// them later if they may access private memory. We don't have enough context
// here, and legalization can handle it.
- if (AddrSpace == ST->getAMDGPUAS().PRIVATE_ADDRESS)
- return false;
- return true;
+ return (AddrSpace != AMDGPUAS::PRIVATE_ADDRESS);
}
bool R600TTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 8e63d789e17d7..397c5c6fa6fbe 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -179,7 +179,7 @@ public:
if (IsGraphicsShader)
return -1;
return ST->hasFlatAddressSpace() ?
- ST->getAMDGPUAS().FLAT_ADDRESS : ST->getAMDGPUAS().UNKNOWN_ADDRESS_SPACE;
+ AMDGPUAS::FLAT_ADDRESS : AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
}
unsigned getVectorSplitCost() { return 0; }
diff --git a/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
index 0d3a1673696a5..ced3f6f567e2f 100644
--- a/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
+++ b/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -25,7 +25,7 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Transforms/Utils/Local.h"
@@ -70,7 +70,7 @@ char &llvm::AMDGPUUnifyDivergentExitNodesID = AMDGPUUnifyDivergentExitNodes::ID;
INITIALIZE_PASS_BEGIN(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
"Unify divergent function exit nodes", false, false)
INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
"Unify divergent function exit nodes", false, false)
@@ -78,10 +78,10 @@ void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const{
// TODO: Preserve dominator tree.
AU.addRequired<PostDominatorTreeWrapperPass>();
- AU.addRequired<DivergenceAnalysis>();
+ AU.addRequired<LegacyDivergenceAnalysis>();
// No divergent values are changed, only blocks and branch edges.
- AU.addPreserved<DivergenceAnalysis>();
+ AU.addPreserved<LegacyDivergenceAnalysis>();
// We preserve the non-critical-edgeness property
AU.addPreservedID(BreakCriticalEdgesID);
@@ -95,7 +95,7 @@ void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const{
/// \returns true if \p BB is reachable through only uniform branches.
/// XXX - Is there a more efficient way to find this?
-static bool isUniformlyReached(const DivergenceAnalysis &DA,
+static bool isUniformlyReached(const LegacyDivergenceAnalysis &DA,
BasicBlock &BB) {
SmallVector<BasicBlock *, 8> Stack;
SmallPtrSet<BasicBlock *, 8> Visited;
@@ -163,7 +163,7 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
if (PDT.getRoots().size() <= 1)
return false;
- DivergenceAnalysis &DA = getAnalysis<DivergenceAnalysis>();
+ LegacyDivergenceAnalysis &DA = getAnalysis<LegacyDivergenceAnalysis>();
// Loop over all of the blocks in a function, tracking all of the blocks that
// return.
diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 31e2885c833d9..3f9af27a2e5e1 100644
--- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -49,6 +49,7 @@
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/SMLoc.h"
+#include "llvm/Support/TargetParser.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
@@ -156,13 +157,12 @@ public:
ImmTyDMask,
ImmTyUNorm,
ImmTyDA,
- ImmTyR128,
+ ImmTyR128A16,
ImmTyLWE,
ImmTyExpTgt,
ImmTyExpCompr,
ImmTyExpVM,
- ImmTyDFMT,
- ImmTyNFMT,
+ ImmTyFORMAT,
ImmTyHwreg,
ImmTyOff,
ImmTySendMsg,
@@ -291,7 +291,7 @@ public:
bool isDMask() const { return isImmTy(ImmTyDMask); }
bool isUNorm() const { return isImmTy(ImmTyUNorm); }
bool isDA() const { return isImmTy(ImmTyDA); }
- bool isR128() const { return isImmTy(ImmTyR128); }
+ bool isR128A16() const { return isImmTy(ImmTyR128A16); }
bool isLWE() const { return isImmTy(ImmTyLWE); }
bool isOff() const { return isImmTy(ImmTyOff); }
bool isExpTgt() const { return isImmTy(ImmTyExpTgt); }
@@ -312,8 +312,7 @@ public:
bool isSLC() const { return isImmTy(ImmTySLC); }
bool isTFE() const { return isImmTy(ImmTyTFE); }
bool isD16() const { return isImmTy(ImmTyD16); }
- bool isDFMT() const { return isImmTy(ImmTyDFMT) && isUInt<8>(getImm()); }
- bool isNFMT() const { return isImmTy(ImmTyNFMT) && isUInt<8>(getImm()); }
+ bool isFORMAT() const { return isImmTy(ImmTyFORMAT) && isUInt<8>(getImm()); }
bool isBankMask() const { return isImmTy(ImmTyDppBankMask); }
bool isRowMask() const { return isImmTy(ImmTyDppRowMask); }
bool isBoundCtrl() const { return isImmTy(ImmTyDppBoundCtrl); }
@@ -666,8 +665,7 @@ public:
case ImmTySLC: OS << "SLC"; break;
case ImmTyTFE: OS << "TFE"; break;
case ImmTyD16: OS << "D16"; break;
- case ImmTyDFMT: OS << "DFMT"; break;
- case ImmTyNFMT: OS << "NFMT"; break;
+ case ImmTyFORMAT: OS << "FORMAT"; break;
case ImmTyClampSI: OS << "ClampSI"; break;
case ImmTyOModSI: OS << "OModSI"; break;
case ImmTyDppCtrl: OS << "DppCtrl"; break;
@@ -681,7 +679,7 @@ public:
case ImmTyDMask: OS << "DMask"; break;
case ImmTyUNorm: OS << "UNorm"; break;
case ImmTyDA: OS << "DA"; break;
- case ImmTyR128: OS << "R128"; break;
+ case ImmTyR128A16: OS << "R128A16"; break;
case ImmTyLWE: OS << "LWE"; break;
case ImmTyOff: OS << "Off"; break;
case ImmTyExpTgt: OS << "ExpTgt"; break;
@@ -920,8 +918,7 @@ public:
// Currently there is none suitable machinery in the core llvm-mc for this.
// MCSymbol::isRedefinable is intended for another purpose, and
// AsmParser::parseDirectiveSet() cannot be specialized for specific target.
- AMDGPU::IsaInfo::IsaVersion ISA =
- AMDGPU::IsaInfo::getIsaVersion(getFeatureBits());
+ AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU());
MCContext &Ctx = getContext();
if (ISA.Major >= 6 && AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI())) {
MCSymbol *Sym =
@@ -1061,6 +1058,7 @@ public:
OperandMatchResultTy parseRegWithFPInputMods(OperandVector &Operands);
OperandMatchResultTy parseRegWithIntInputMods(OperandVector &Operands);
OperandMatchResultTy parseVReg32OrOff(OperandVector &Operands);
+ OperandMatchResultTy parseDfmtNfmt(OperandVector &Operands);
void cvtDSOffset01(MCInst &Inst, const OperandVector &Operands);
void cvtDS(MCInst &Inst, const OperandVector &Operands) { cvtDSImpl(Inst, Operands, false); }
@@ -1092,7 +1090,6 @@ private:
bool validateMIMGAtomicDMask(const MCInst &Inst);
bool validateMIMGGatherDMask(const MCInst &Inst);
bool validateMIMGDataSize(const MCInst &Inst);
- bool validateMIMGR128(const MCInst &Inst);
bool validateMIMGD16(const MCInst &Inst);
bool usesConstantBus(const MCInst &Inst, unsigned OpIdx);
bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const;
@@ -1829,7 +1826,7 @@ bool AMDGPUAsmParser::updateGprCountSymbols(RegisterKind RegKind,
unsigned DwordRegIndex,
unsigned RegWidth) {
// Symbols are only defined for GCN targets
- if (AMDGPU::IsaInfo::getIsaVersion(getFeatureBits()).Major < 6)
+ if (AMDGPU::getIsaVersion(getSTI().getCPU()).Major < 6)
return true;
auto SymbolName = getGprCountSymbolName(RegKind);
@@ -2447,22 +2444,6 @@ bool AMDGPUAsmParser::validateMIMGGatherDMask(const MCInst &Inst) {
return DMask == 0x1 || DMask == 0x2 || DMask == 0x4 || DMask == 0x8;
}
-bool AMDGPUAsmParser::validateMIMGR128(const MCInst &Inst) {
-
- const unsigned Opc = Inst.getOpcode();
- const MCInstrDesc &Desc = MII.get(Opc);
-
- if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0)
- return true;
-
- int Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::r128);
- assert(Idx != -1);
-
- bool R128 = (Inst.getOperand(Idx).getImm() != 0);
-
- return !R128 || hasMIMG_R128();
-}
-
bool AMDGPUAsmParser::validateMIMGD16(const MCInst &Inst) {
const unsigned Opc = Inst.getOpcode();
@@ -2497,11 +2478,6 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
"integer clamping is not supported on this GPU");
return false;
}
- if (!validateMIMGR128(Inst)) {
- Error(IDLoc,
- "r128 modifier is not supported on this GPU");
- return false;
- }
// For MUBUF/MTBUF d16 is a part of opcode, so there is nothing to validate.
if (!validateMIMGD16(Inst)) {
Error(IDLoc,
@@ -2661,18 +2637,18 @@ bool AMDGPUAsmParser::calculateGPRBlocks(
unsigned &SGPRBlocks) {
// TODO(scott.linder): These calculations are duplicated from
// AMDGPUAsmPrinter::getSIProgramInfo and could be unified.
- IsaInfo::IsaVersion Version = IsaInfo::getIsaVersion(Features);
+ IsaVersion Version = getIsaVersion(getSTI().getCPU());
unsigned NumVGPRs = NextFreeVGPR;
unsigned NumSGPRs = NextFreeSGPR;
- unsigned MaxAddressableNumSGPRs = IsaInfo::getAddressableNumSGPRs(Features);
+ unsigned MaxAddressableNumSGPRs = IsaInfo::getAddressableNumSGPRs(&getSTI());
if (Version.Major >= 8 && !Features.test(FeatureSGPRInitBug) &&
NumSGPRs > MaxAddressableNumSGPRs)
return OutOfRangeError(SGPRRange);
NumSGPRs +=
- IsaInfo::getNumExtraSGPRs(Features, VCCUsed, FlatScrUsed, XNACKUsed);
+ IsaInfo::getNumExtraSGPRs(&getSTI(), VCCUsed, FlatScrUsed, XNACKUsed);
if ((Version.Major <= 7 || Features.test(FeatureSGPRInitBug)) &&
NumSGPRs > MaxAddressableNumSGPRs)
@@ -2681,8 +2657,8 @@ bool AMDGPUAsmParser::calculateGPRBlocks(
if (Features.test(FeatureSGPRInitBug))
NumSGPRs = IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
- VGPRBlocks = IsaInfo::getNumVGPRBlocks(Features, NumVGPRs);
- SGPRBlocks = IsaInfo::getNumSGPRBlocks(Features, NumSGPRs);
+ VGPRBlocks = IsaInfo::getNumVGPRBlocks(&getSTI(), NumVGPRs);
+ SGPRBlocks = IsaInfo::getNumSGPRBlocks(&getSTI(), NumSGPRs);
return false;
}
@@ -2702,8 +2678,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
StringSet<> Seen;
- IsaInfo::IsaVersion IVersion =
- IsaInfo::getIsaVersion(getSTI().getFeatureBits());
+ IsaVersion IVersion = getIsaVersion(getSTI().getCPU());
SMRange VGPRRange;
uint64_t NextFreeVGPR = 0;
@@ -2962,8 +2937,7 @@ bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() {
// If this directive has no arguments, then use the ISA version for the
// targeted GPU.
if (getLexer().is(AsmToken::EndOfStatement)) {
- AMDGPU::IsaInfo::IsaVersion ISA =
- AMDGPU::IsaInfo::getIsaVersion(getFeatureBits());
+ AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU());
getTargetStreamer().EmitDirectiveHSACodeObjectISA(ISA.Major, ISA.Minor,
ISA.Stepping,
"AMD", "AMDGPU");
@@ -3025,7 +2999,7 @@ bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID,
bool AMDGPUAsmParser::ParseDirectiveAMDKernelCodeT() {
amd_kernel_code_t Header;
- AMDGPU::initDefaultAMDKernelCodeT(Header, getFeatureBits());
+ AMDGPU::initDefaultAMDKernelCodeT(Header, &getSTI());
while (true) {
// Lex EndOfStatement. This is in a while loop, because lexing a comment
@@ -3091,9 +3065,18 @@ bool AMDGPUAsmParser::ParseDirectiveISAVersion() {
}
bool AMDGPUAsmParser::ParseDirectiveHSAMetadata() {
+ const char *AssemblerDirectiveBegin;
+ const char *AssemblerDirectiveEnd;
+ std::tie(AssemblerDirectiveBegin, AssemblerDirectiveEnd) =
+ AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI())
+ ? std::make_tuple(HSAMD::V3::AssemblerDirectiveBegin,
+ HSAMD::V3::AssemblerDirectiveEnd)
+ : std::make_tuple(HSAMD::AssemblerDirectiveBegin,
+ HSAMD::AssemblerDirectiveEnd);
+
if (getSTI().getTargetTriple().getOS() != Triple::AMDHSA) {
return Error(getParser().getTok().getLoc(),
- (Twine(HSAMD::AssemblerDirectiveBegin) + Twine(" directive is "
+ (Twine(AssemblerDirectiveBegin) + Twine(" directive is "
"not available on non-amdhsa OSes")).str());
}
@@ -3111,7 +3094,7 @@ bool AMDGPUAsmParser::ParseDirectiveHSAMetadata() {
if (getLexer().is(AsmToken::Identifier)) {
StringRef ID = getLexer().getTok().getIdentifier();
- if (ID == AMDGPU::HSAMD::AssemblerDirectiveEnd) {
+ if (ID == AssemblerDirectiveEnd) {
Lex();
FoundEnd = true;
break;
@@ -3133,8 +3116,13 @@ bool AMDGPUAsmParser::ParseDirectiveHSAMetadata() {
YamlStream.flush();
- if (!getTargetStreamer().EmitHSAMetadata(HSAMetadataString))
- return Error(getParser().getTok().getLoc(), "invalid HSA metadata");
+ if (IsaInfo::hasCodeObjectV3(&getSTI())) {
+ if (!getTargetStreamer().EmitHSAMetadataV3(HSAMetadataString))
+ return Error(getParser().getTok().getLoc(), "invalid HSA metadata");
+ } else {
+ if (!getTargetStreamer().EmitHSAMetadataV2(HSAMetadataString))
+ return Error(getParser().getTok().getLoc(), "invalid HSA metadata");
+ }
return false;
}
@@ -3171,6 +3159,10 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
if (IDVal == ".amdhsa_kernel")
return ParseDirectiveAMDHSAKernel();
+
+ // TODO: Restructure/combine with PAL metadata directive.
+ if (IDVal == AMDGPU::HSAMD::V3::AssemblerDirectiveBegin)
+ return ParseDirectiveHSAMetadata();
} else {
if (IDVal == ".hsa_code_object_version")
return ParseDirectiveHSACodeObjectVersion();
@@ -3186,10 +3178,10 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
if (IDVal == ".amd_amdgpu_isa")
return ParseDirectiveISAVersion();
- }
- if (IDVal == AMDGPU::HSAMD::AssemblerDirectiveBegin)
- return ParseDirectiveHSAMetadata();
+ if (IDVal == AMDGPU::HSAMD::AssemblerDirectiveBegin)
+ return ParseDirectiveHSAMetadata();
+ }
if (IDVal == PALMD::AssemblerDirective)
return ParseDirectivePALMetadata();
@@ -3465,6 +3457,10 @@ AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands,
case AsmToken::Identifier: {
StringRef Tok = Parser.getTok().getString();
if (Tok == Name) {
+ if (Tok == "r128" && isGFX9())
+ Error(S, "r128 modifier is not supported on this GPU");
+ if (Tok == "a16" && !isGFX9())
+ Error(S, "a16 modifier is not supported on this GPU");
Bit = 1;
Parser.Lex();
} else if (Tok.startswith("no") && Tok.endswith(Name)) {
@@ -3522,6 +3518,53 @@ AMDGPUAsmParser::parseStringWithPrefix(StringRef Prefix, StringRef &Value) {
return MatchOperand_Success;
}
+// dfmt and nfmt (in a tbuffer instruction) are parsed as one to allow their
+// values to live in a joint format operand in the MCInst encoding.
+//
+// Accepts "dfmt:<n>" and "nfmt:<n>" in either order; each may appear at most
+// once and each is optional. On success a single ImmTyFORMAT immediate is
+// appended to Operands, with dfmt in bits [3:0] and nfmt in bits [6:4].
+//
+// Returns MatchOperand_NoMatch when neither prefix is present, forwards any
+// non-success result from parseIntWithPrefix unchanged, and returns
+// MatchOperand_ParseFail (after reporting an error) when a value does not fit
+// its field.
+OperandMatchResultTy
+AMDGPUAsmParser::parseDfmtNfmt(OperandVector &Operands) {
+  SMLoc S = Parser.getTok().getLoc();
+  int64_t Dfmt = 0, Nfmt = 0;
+  // dfmt and nfmt can appear in either order, and each is optional.
+  bool GotDfmt = false, GotNfmt = false;
+  while (!GotDfmt || !GotNfmt) {
+    if (!GotDfmt) {
+      auto Res = parseIntWithPrefix("dfmt", Dfmt);
+      if (Res != MatchOperand_NoMatch) {
+        if (Res != MatchOperand_Success)
+          return Res;
+        // The value is a signed 64-bit integer: reject negatives as well as
+        // values that do not fit the 4-bit dfmt field, otherwise a negative
+        // value would slip through and corrupt the packed format below.
+        if (Dfmt < 0 || Dfmt >= 16) {
+          Error(Parser.getTok().getLoc(), "out of range dfmt");
+          return MatchOperand_ParseFail;
+        }
+        GotDfmt = true;
+        Parser.Lex();
+        continue;
+      }
+    }
+    if (!GotNfmt) {
+      auto Res = parseIntWithPrefix("nfmt", Nfmt);
+      if (Res != MatchOperand_NoMatch) {
+        if (Res != MatchOperand_Success)
+          return Res;
+        // Same reasoning as for dfmt: nfmt must fit in 3 unsigned bits.
+        // A negative nfmt would also be left-shifted below, which is
+        // undefined behaviour for signed values before C++20.
+        if (Nfmt < 0 || Nfmt >= 8) {
+          Error(Parser.getTok().getLoc(), "out of range nfmt");
+          return MatchOperand_ParseFail;
+        }
+        GotNfmt = true;
+        Parser.Lex();
+        continue;
+      }
+    }
+    // Neither remaining prefix matched at the current token: stop scanning.
+    break;
+  }
+  if (!GotDfmt && !GotNfmt)
+    return MatchOperand_NoMatch;
+  // Pack both fields into the joint format immediate (an omitted field is 0).
+  auto Format = Dfmt | (Nfmt << 4);
+  Operands.push_back(
+      AMDGPUOperand::CreateImm(this, Format, S, AMDGPUOperand::ImmTyFORMAT));
+  return MatchOperand_Success;
+}
+
//===----------------------------------------------------------------------===//
// ds
//===----------------------------------------------------------------------===//
@@ -3652,12 +3695,12 @@ void AMDGPUAsmParser::cvtExp(MCInst &Inst, const OperandVector &Operands) {
static bool
encodeCnt(
- const AMDGPU::IsaInfo::IsaVersion ISA,
+ const AMDGPU::IsaVersion ISA,
int64_t &IntVal,
int64_t CntVal,
bool Saturate,
- unsigned (*encode)(const IsaInfo::IsaVersion &Version, unsigned, unsigned),
- unsigned (*decode)(const IsaInfo::IsaVersion &Version, unsigned))
+ unsigned (*encode)(const IsaVersion &Version, unsigned, unsigned),
+ unsigned (*decode)(const IsaVersion &Version, unsigned))
{
bool Failed = false;
@@ -3688,8 +3731,7 @@ bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) {
if (getParser().parseAbsoluteExpression(CntVal))
return true;
- AMDGPU::IsaInfo::IsaVersion ISA =
- AMDGPU::IsaInfo::getIsaVersion(getFeatureBits());
+ AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU());
bool Failed = true;
bool Sat = CntName.endswith("_sat");
@@ -3724,8 +3766,7 @@ bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) {
OperandMatchResultTy
AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) {
- AMDGPU::IsaInfo::IsaVersion ISA =
- AMDGPU::IsaInfo::getIsaVersion(getFeatureBits());
+ AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU());
int64_t Waitcnt = getWaitcntBitMask(ISA);
SMLoc S = Parser.getTok().getLoc();
@@ -4617,8 +4658,7 @@ void AMDGPUAsmParser::cvtMtbuf(MCInst &Inst, const OperandVector &Operands) {
addOptionalImmOperand(Inst, Operands, OptionalIdx,
AMDGPUOperand::ImmTyOffset);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDFMT);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyNFMT);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyFORMAT);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
@@ -4661,7 +4701,7 @@ void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands,
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyUNorm);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyR128);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyR128A16);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyLWE);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDA);
@@ -4761,8 +4801,7 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
{"lds", AMDGPUOperand::ImmTyLDS, true, nullptr},
{"offset", AMDGPUOperand::ImmTyOffset, false, nullptr},
{"inst_offset", AMDGPUOperand::ImmTyInstOffset, false, nullptr},
- {"dfmt", AMDGPUOperand::ImmTyDFMT, false, nullptr},
- {"nfmt", AMDGPUOperand::ImmTyNFMT, false, nullptr},
+ {"dfmt", AMDGPUOperand::ImmTyFORMAT, false, nullptr},
{"glc", AMDGPUOperand::ImmTyGLC, true, nullptr},
{"slc", AMDGPUOperand::ImmTySLC, true, nullptr},
{"tfe", AMDGPUOperand::ImmTyTFE, true, nullptr},
@@ -4772,7 +4811,8 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
{"omod", AMDGPUOperand::ImmTyOModSI, false, ConvertOmodMul},
{"unorm", AMDGPUOperand::ImmTyUNorm, true, nullptr},
{"da", AMDGPUOperand::ImmTyDA, true, nullptr},
- {"r128", AMDGPUOperand::ImmTyR128, true, nullptr},
+ {"r128", AMDGPUOperand::ImmTyR128A16, true, nullptr},
+ {"a16", AMDGPUOperand::ImmTyR128A16, true, nullptr},
{"lwe", AMDGPUOperand::ImmTyLWE, true, nullptr},
{"d16", AMDGPUOperand::ImmTyD16, true, nullptr},
{"dmask", AMDGPUOperand::ImmTyDMask, false, nullptr},
@@ -4844,6 +4884,8 @@ OperandMatchResultTy AMDGPUAsmParser::parseOptionalOpr(OperandVector &Operands)
Op.Type == AMDGPUOperand::ImmTyNegHi) {
res = parseOperandArrayWithPrefix(Op.Name, Operands, Op.Type,
Op.ConvertResult);
+ } else if (Op.Type == AMDGPUOperand::ImmTyFORMAT) {
+ res = parseDfmtNfmt(Operands);
} else {
res = parseIntWithPrefix(Op.Name, Operands, Op.Type, Op.ConvertResult);
}
@@ -5251,12 +5293,14 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) {
((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
}
- // All DPP instructions with at least one source operand have a fake "old"
- // source at the beginning that's tied to the dst operand. Handle it here.
- if (Desc.getNumOperands() >= 2)
- Inst.addOperand(Inst.getOperand(0));
-
for (unsigned E = Operands.size(); I != E; ++I) {
+ auto TiedTo = Desc.getOperandConstraint(Inst.getNumOperands(),
+ MCOI::TIED_TO);
+ if (TiedTo != -1) {
+ assert((unsigned)TiedTo < Inst.getNumOperands());
+ // handle tied old or src2 for MAC instructions
+ Inst.addOperand(Inst.getOperand(TiedTo));
+ }
AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
// Add the register arguments
if (Op.isReg() && Op.Reg.RegNo == AMDGPU::VCC) {
diff --git a/lib/Target/AMDGPU/BUFInstructions.td b/lib/Target/AMDGPU/BUFInstructions.td
index b87c47a6b9eea..51c2abeac2ffb 100644
--- a/lib/Target/AMDGPU/BUFInstructions.td
+++ b/lib/Target/AMDGPU/BUFInstructions.td
@@ -17,14 +17,12 @@ def MUBUFScratchOffset : ComplexPattern<i64, 3, "SelectMUBUFScratchOffset", [],
def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">;
def MUBUFOffsetNoGLC : ComplexPattern<i64, 3, "SelectMUBUFOffset">;
def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">;
-def MUBUFIntrinsicOffset : ComplexPattern<i32, 2, "SelectMUBUFIntrinsicOffset">;
-def MUBUFIntrinsicVOffset : ComplexPattern<i32, 3, "SelectMUBUFIntrinsicVOffset">;
class MubufLoad <SDPatternOperator op> : PatFrag <
(ops node:$ptr), (op node:$ptr), [{
auto const AS = cast<MemSDNode>(N)->getAddressSpace();
- return AS == AMDGPUASI.GLOBAL_ADDRESS ||
- AS == AMDGPUASI.CONSTANT_ADDRESS;
+ return AS == AMDGPUAS::GLOBAL_ADDRESS ||
+ AS == AMDGPUAS::CONSTANT_ADDRESS;
}]>;
def mubuf_load : MubufLoad <load>;
@@ -100,15 +98,11 @@ class MTBUF_Pseudo <string opName, dag outs, dag ins,
bits<1> has_vaddr = 1;
bits<1> has_glc = 1;
bits<1> glc_value = 0; // the value for glc if no such operand
- bits<4> dfmt_value = 1; // the value for dfmt if no such operand
- bits<3> nfmt_value = 0; // the value for nfmt if no such operand
bits<1> has_srsrc = 1;
bits<1> has_soffset = 1;
bits<1> has_offset = 1;
bits<1> has_slc = 1;
bits<1> has_tfe = 1;
- bits<1> has_dfmt = 1;
- bits<1> has_nfmt = 1;
}
class MTBUF_Real <MTBUF_Pseudo ps> :
@@ -126,14 +120,16 @@ class MTBUF_Real <MTBUF_Pseudo ps> :
bits<12> offset;
bits<1> glc;
- bits<4> dfmt;
- bits<3> nfmt;
+ bits<7> format;
bits<8> vaddr;
bits<8> vdata;
bits<7> srsrc;
bits<1> slc;
bits<1> tfe;
bits<8> soffset;
+
+ bits<4> dfmt = format{3-0};
+ bits<3> nfmt = format{6-4};
}
class getMTBUFInsDA<list<RegisterClass> vdataList,
@@ -142,16 +138,16 @@ class getMTBUFInsDA<list<RegisterClass> vdataList,
RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
dag InsNoData = !if(!empty(vaddrList),
(ins SReg_128:$srsrc, SCSrc_b32:$soffset,
- offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, SLC:$slc, TFE:$tfe),
+ offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe),
(ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset,
- offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, SLC:$slc, TFE:$tfe)
+ offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe)
);
dag InsData = !if(!empty(vaddrList),
(ins vdataClass:$vdata, SReg_128:$srsrc,
- SCSrc_b32:$soffset, offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc,
+ SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, GLC:$glc,
SLC:$slc, TFE:$tfe),
(ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc,
- SCSrc_b32:$soffset, offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc,
+ SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, GLC:$glc,
SLC:$slc, TFE:$tfe)
);
dag ret = !if(!empty(vdataList), InsNoData, InsData);
@@ -169,15 +165,15 @@ class getMTBUFIns<int addrKind, list<RegisterClass> vdataList=[]> {
class getMTBUFAsmOps<int addrKind> {
string Pfx =
- !if(!eq(addrKind, BUFAddrKind.Offset), "off, $srsrc, $dfmt, $nfmt, $soffset",
+ !if(!eq(addrKind, BUFAddrKind.Offset), "off, $srsrc, $format, $soffset",
!if(!eq(addrKind, BUFAddrKind.OffEn),
- "$vaddr, $srsrc, $dfmt, $nfmt, $soffset offen",
+ "$vaddr, $srsrc, $format, $soffset offen",
!if(!eq(addrKind, BUFAddrKind.IdxEn),
- "$vaddr, $srsrc, $dfmt, $nfmt, $soffset idxen",
+ "$vaddr, $srsrc, $format, $soffset idxen",
!if(!eq(addrKind, BUFAddrKind.BothEn),
- "$vaddr, $srsrc, $dfmt, $nfmt, $soffset idxen offen",
+ "$vaddr, $srsrc, $format, $soffset idxen offen",
!if(!eq(addrKind, BUFAddrKind.Addr64),
- "$vaddr, $srsrc, $dfmt, $nfmt, $soffset addr64",
+ "$vaddr, $srsrc, $format, $soffset addr64",
"")))));
string ret = Pfx # "$offset";
}
@@ -217,14 +213,14 @@ multiclass MTBUF_Pseudo_Loads<string opName, RegisterClass vdataClass,
def _OFFSET : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
[(set load_vt:$vdata,
- (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i8:$dfmt,
- i8:$nfmt, i1:$glc, i1:$slc, i1:$tfe)))]>,
+ (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i8:$format,
+ i1:$glc, i1:$slc, i1:$tfe)))]>,
MTBUFAddr64Table<0, NAME>;
def _ADDR64 : MTBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
[(set load_vt:$vdata,
(ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset,
- i8:$dfmt, i8:$nfmt, i1:$glc, i1:$slc, i1:$tfe)))]>,
+ i8:$format, i1:$glc, i1:$slc, i1:$tfe)))]>,
MTBUFAddr64Table<1, NAME>;
def _OFFEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
@@ -263,13 +259,13 @@ multiclass MTBUF_Pseudo_Stores<string opName, RegisterClass vdataClass,
def _OFFSET : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
[(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
- i16:$offset, i8:$dfmt, i8:$nfmt, i1:$glc,
+ i16:$offset, i8:$format, i1:$glc,
i1:$slc, i1:$tfe))]>,
MTBUFAddr64Table<0, NAME>;
def _ADDR64 : MTBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
[(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
- i16:$offset, i8:$dfmt, i8:$nfmt, i1:$glc,
+ i16:$offset, i8:$format, i1:$glc,
i1:$slc, i1:$tfe))]>,
MTBUFAddr64Table<1, NAME>;
@@ -290,6 +286,12 @@ multiclass MTBUF_Pseudo_Stores<string opName, RegisterClass vdataClass,
// MUBUF classes
//===----------------------------------------------------------------------===//
+// Maps an instruction name to a "base" name by rewriting the substrings
+// "DWORDX2", "DWORDX3" and "DWORDX4" in Op to plain "DWORD"; names that
+// contain none of these substrings are returned unchanged in `ret`.
+// The substitutions are nested, with DWORDX4 applied first (innermost) and
+// DWORDX2 last (outermost).
+class MUBUFGetBaseOpcode<string Op> {
+ string ret = !subst("DWORDX2", "DWORD",
+ !subst("DWORDX3", "DWORD",
+ !subst("DWORDX4", "DWORD", Op)));
+}
+
class MUBUF_Pseudo <string opName, dag outs, dag ins,
string asmOps, list<dag> pattern=[]> :
InstSI<outs, ins, "", pattern>,
@@ -303,6 +305,9 @@ class MUBUF_Pseudo <string opName, dag outs, dag ins,
string Mnemonic = opName;
string AsmOperands = asmOps;
+ Instruction Opcode = !cast<Instruction>(NAME);
+ Instruction BaseOpcode = !cast<Instruction>(MUBUFGetBaseOpcode<NAME>.ret);
+
let VM_CNT = 1;
let EXP_CNT = 1;
let MUBUF = 1;
@@ -325,6 +330,7 @@ class MUBUF_Pseudo <string opName, dag outs, dag ins,
bits<1> has_offset = 1;
bits<1> has_slc = 1;
bits<1> has_tfe = 1;
+ bits<4> dwords = 0;
}
class MUBUF_Real <bits<7> op, MUBUF_Pseudo ps> :
@@ -398,6 +404,16 @@ class getMUBUFInsDA<list<RegisterClass> vdataList,
);
}
+// Computes in `ret` a dword count for the given register class by comparing
+// the class's string form against known classes:
+// VGPR_32 -> 1, VReg_64 -> 2, VReg_96 -> 3, VReg_128 -> 4, anything else -> 0.
+// Note: despite its name, `regClassAsInt` holds the register class cast to a
+// string, not an integer.
+class getMUBUFDwords<RegisterClass regClass> {
+ string regClassAsInt = !cast<string>(regClass);
+ int ret =
+ !if(!eq(regClassAsInt, !cast<string>(VGPR_32)), 1,
+ !if(!eq(regClassAsInt, !cast<string>(VReg_64)), 2,
+ !if(!eq(regClassAsInt, !cast<string>(VReg_96)), 3,
+ !if(!eq(regClassAsInt, !cast<string>(VReg_128)), 4,
+ 0))));
+}
+
class getMUBUFIns<int addrKind, list<RegisterClass> vdataList=[], bit isLds = 0> {
dag ret =
!if(!eq(addrKind, BUFAddrKind.Offset), getMUBUFInsDA<vdataList, [], isLds>.ret,
@@ -458,6 +474,7 @@ class MUBUF_Load_Pseudo <string opName,
let Uses = !if(isLds, [EXEC, M0], [EXEC]);
let has_tfe = !if(isLds, 0, 1);
let lds = isLds;
+ let dwords = getMUBUFDwords<vdataClass>.ret;
}
// FIXME: tfe can't be an operand because it requires a separate
@@ -521,6 +538,7 @@ class MUBUF_Store_Pseudo <string opName,
let mayLoad = 0;
let mayStore = 1;
let maybeAtomic = 1;
+ let dwords = getMUBUFDwords<vdataClass>.ret;
}
multiclass MUBUF_Pseudo_Stores<string opName, RegisterClass vdataClass,
@@ -660,11 +678,10 @@ class MUBUF_AtomicRet_Pseudo<string opName, int addrKind,
let AsmMatchConverter = "cvtMubufAtomicReturn";
}
-multiclass MUBUF_Pseudo_Atomics <string opName,
- RegisterClass vdataClass,
- ValueType vdataType,
- SDPatternOperator atomic> {
-
+multiclass MUBUF_Pseudo_Atomics_NO_RTN <string opName,
+ RegisterClass vdataClass,
+ ValueType vdataType,
+ SDPatternOperator atomic> {
def _OFFSET : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass>,
MUBUFAddr64Table <0, NAME>;
def _ADDR64 : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass>,
@@ -672,7 +689,12 @@ multiclass MUBUF_Pseudo_Atomics <string opName,
def _OFFEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
def _IDXEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
def _BOTHEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+}
+multiclass MUBUF_Pseudo_Atomics_RTN <string opName,
+ RegisterClass vdataClass,
+ ValueType vdataType,
+ SDPatternOperator atomic> {
def _OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
[(set vdataType:$vdata,
(atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$slc),
@@ -690,6 +712,13 @@ multiclass MUBUF_Pseudo_Atomics <string opName,
def _BOTHEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
}
+// Convenience multiclass that instantiates both MUBUF_Pseudo_Atomics_NO_RTN
+// and MUBUF_Pseudo_Atomics_RTN with the same arguments; it adds no
+// definitions of its own.
+multiclass MUBUF_Pseudo_Atomics <string opName,
+ RegisterClass vdataClass,
+ ValueType vdataType,
+ SDPatternOperator atomic> :
+ MUBUF_Pseudo_Atomics_NO_RTN<opName, vdataClass, vdataType, atomic>,
+ MUBUF_Pseudo_Atomics_RTN<opName, vdataClass, vdataType, atomic>;
+
//===----------------------------------------------------------------------===//
// MUBUF Instructions
@@ -1030,6 +1059,14 @@ def BUFFER_WBINVL1_VOL : MUBUF_Invalidate <"buffer_wbinvl1_vol",
// MUBUF Patterns
//===----------------------------------------------------------------------===//
+// Transforms an immediate node into an i8 target constant holding bit 0 of
+// its zero-extended value. The buffer intrinsic patterns in this file apply
+// it to their $cachepolicy immediate to produce the glc operand.
+def extract_glc : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() & 1, SDLoc(N), MVT::i8);
+}]>;
+
+// Transforms an immediate node into an i8 target constant holding bit 1 of
+// its zero-extended value (shifted down to bit 0). The buffer intrinsic
+// patterns in this file apply it to their $cachepolicy immediate to produce
+// the slc operand.
+def extract_slc : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant((N->getZExtValue() >> 1) & 1, SDLoc(N), MVT::i8);
+}]>;
+
//===----------------------------------------------------------------------===//
// buffer_load/store_format patterns
//===----------------------------------------------------------------------===//
@@ -1037,119 +1074,129 @@ def BUFFER_WBINVL1_VOL : MUBUF_Invalidate <"buffer_wbinvl1_vol",
multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
string opcode> {
def : GCNPat<
- (vt (name v4i32:$rsrc, 0,
- (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
- imm:$glc, imm:$slc)),
+ (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset,
+ imm:$cachepolicy, 0)),
(!cast<MUBUF_Pseudo>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset),
- (as_i1imm $glc), (as_i1imm $slc), 0)
+ (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
>;
def : GCNPat<
- (vt (name v4i32:$rsrc, i32:$vindex,
- (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
- imm:$glc, imm:$slc)),
- (!cast<MUBUF_Pseudo>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset),
- (as_i1imm $glc), (as_i1imm $slc), 0)
+ (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset,
+ imm:$cachepolicy, 0)),
+ (!cast<MUBUF_Pseudo>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset),
+ (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
>;
def : GCNPat<
- (vt (name v4i32:$rsrc, 0,
- (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
- imm:$glc, imm:$slc)),
- (!cast<MUBUF_Pseudo>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset),
- (as_i1imm $glc), (as_i1imm $slc), 0)
+ (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset,
+ imm:$cachepolicy, imm)),
+ (!cast<MUBUF_Pseudo>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset),
+ (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
>;
def : GCNPat<
- (vt (name v4i32:$rsrc, i32:$vindex,
- (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
- imm:$glc, imm:$slc)),
+ (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, imm:$offset,
+ imm:$cachepolicy, imm)),
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN)
(REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
$rsrc, $soffset, (as_i16imm $offset),
- (as_i1imm $glc), (as_i1imm $slc), 0)
+ (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
>;
}
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, f32, "BUFFER_LOAD_FORMAT_X">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, i32, "BUFFER_LOAD_FORMAT_X">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v2f32, "BUFFER_LOAD_FORMAT_XY">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v2i32, "BUFFER_LOAD_FORMAT_XY">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v4f32, "BUFFER_LOAD_FORMAT_XYZW">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v4i32, "BUFFER_LOAD_FORMAT_XYZW">;
let SubtargetPredicate = HasUnpackedD16VMem in {
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, f16, "BUFFER_LOAD_FORMAT_D16_X_gfx80">;
+ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i16, "BUFFER_LOAD_FORMAT_D16_X_gfx80">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2i32, "BUFFER_LOAD_FORMAT_D16_XY_gfx80">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v4i32, "BUFFER_LOAD_FORMAT_D16_XYZW_gfx80">;
} // End HasUnpackedD16VMem.
let SubtargetPredicate = HasPackedD16VMem in {
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, f16, "BUFFER_LOAD_FORMAT_D16_X">;
+ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i16, "BUFFER_LOAD_FORMAT_D16_X">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2f16, "BUFFER_LOAD_FORMAT_D16_XY">;
+ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2i16, "BUFFER_LOAD_FORMAT_D16_XY">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v4f16, "BUFFER_LOAD_FORMAT_D16_XYZW">;
+ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v4i16, "BUFFER_LOAD_FORMAT_D16_XYZW">;
} // End HasPackedD16VMem.
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, f32, "BUFFER_LOAD_DWORD">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, i32, "BUFFER_LOAD_DWORD">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f32, "BUFFER_LOAD_DWORDX2">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2i32, "BUFFER_LOAD_DWORDX2">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f32, "BUFFER_LOAD_DWORDX4">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4i32, "BUFFER_LOAD_DWORDX4">;
multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
string opcode> {
def : GCNPat<
- (name vt:$vdata, v4i32:$rsrc, 0,
- (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
- imm:$glc, imm:$slc),
+ (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset,
+ imm:$cachepolicy, 0),
(!cast<MUBUF_Pseudo>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, (as_i16imm $offset),
- (as_i1imm $glc), (as_i1imm $slc), 0)
+ (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
>;
def : GCNPat<
- (name vt:$vdata, v4i32:$rsrc, i32:$vindex,
- (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
- imm:$glc, imm:$slc),
- (!cast<MUBUF_Pseudo>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset,
- (as_i16imm $offset), (as_i1imm $glc),
- (as_i1imm $slc), 0)
+ (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset,
+ imm:$cachepolicy, 0),
+ (!cast<MUBUF_Pseudo>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset,
+ (as_i16imm $offset), (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
>;
def : GCNPat<
- (name vt:$vdata, v4i32:$rsrc, 0,
- (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
- imm:$glc, imm:$slc),
- (!cast<MUBUF_Pseudo>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset,
- (as_i16imm $offset), (as_i1imm $glc),
- (as_i1imm $slc), 0)
+ (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset,
+ imm:$cachepolicy, imm),
+ (!cast<MUBUF_Pseudo>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset,
+ (as_i16imm $offset), (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
>;
def : GCNPat<
- (name vt:$vdata, v4i32:$rsrc, i32:$vindex,
- (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
- imm:$glc, imm:$slc),
+ (name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, imm:$offset,
+ imm:$cachepolicy, imm),
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN_exact)
$vdata,
(REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
$rsrc, $soffset, (as_i16imm $offset),
- (as_i1imm $glc), (as_i1imm $slc), 0)
+ (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
>;
}
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, f32, "BUFFER_STORE_FORMAT_X">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, i32, "BUFFER_STORE_FORMAT_X">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2f32, "BUFFER_STORE_FORMAT_XY">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2i32, "BUFFER_STORE_FORMAT_XY">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v4f32, "BUFFER_STORE_FORMAT_XYZW">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v4i32, "BUFFER_STORE_FORMAT_XYZW">;
let SubtargetPredicate = HasUnpackedD16VMem in {
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, f16, "BUFFER_STORE_FORMAT_D16_X_gfx80">;
+ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, i16, "BUFFER_STORE_FORMAT_D16_X_gfx80">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v2i32, "BUFFER_STORE_FORMAT_D16_XY_gfx80">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v4i32, "BUFFER_STORE_FORMAT_D16_XYZW_gfx80">;
} // End HasUnpackedD16VMem.
let SubtargetPredicate = HasPackedD16VMem in {
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, f16, "BUFFER_STORE_FORMAT_D16_X">;
+ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, i16, "BUFFER_STORE_FORMAT_D16_X">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v2f16, "BUFFER_STORE_FORMAT_D16_XY">;
+ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v2i16, "BUFFER_STORE_FORMAT_D16_XY">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v4f16, "BUFFER_STORE_FORMAT_D16_XYZW">;
+ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v4i16, "BUFFER_STORE_FORMAT_D16_XYZW">;
} // End HasPackedD16VMem.
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, f32, "BUFFER_STORE_DWORD">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, i32, "BUFFER_STORE_DWORD">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f32, "BUFFER_STORE_DWORDX2">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2i32, "BUFFER_STORE_DWORDX2">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f32, "BUFFER_STORE_DWORDX4">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4i32, "BUFFER_STORE_DWORDX4">;
//===----------------------------------------------------------------------===//
// buffer_atomic patterns
@@ -1158,36 +1205,36 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f32, "BUFFER_STORE_DWORDX4">;
multiclass BufferAtomicPatterns<SDPatternOperator name, string opcode> {
def : GCNPat<
(name i32:$vdata_in, v4i32:$rsrc, 0,
- (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
- imm:$slc),
+ 0, i32:$soffset, imm:$offset,
+ imm:$cachepolicy, 0),
(!cast<MUBUF_Pseudo>(opcode # _OFFSET_RTN) $vdata_in, $rsrc, $soffset,
- (as_i16imm $offset), (as_i1imm $slc))
+ (as_i16imm $offset), (extract_slc $cachepolicy))
>;
def : GCNPat<
(name i32:$vdata_in, v4i32:$rsrc, i32:$vindex,
- (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
- imm:$slc),
+ 0, i32:$soffset, imm:$offset,
+ imm:$cachepolicy, imm),
(!cast<MUBUF_Pseudo>(opcode # _IDXEN_RTN) $vdata_in, $vindex, $rsrc, $soffset,
- (as_i16imm $offset), (as_i1imm $slc))
+ (as_i16imm $offset), (extract_slc $cachepolicy))
>;
def : GCNPat<
(name i32:$vdata_in, v4i32:$rsrc, 0,
- (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
- imm:$slc),
+ i32:$voffset, i32:$soffset, imm:$offset,
+ imm:$cachepolicy, 0),
(!cast<MUBUF_Pseudo>(opcode # _OFFEN_RTN) $vdata_in, $voffset, $rsrc, $soffset,
- (as_i16imm $offset), (as_i1imm $slc))
+ (as_i16imm $offset), (extract_slc $cachepolicy))
>;
def : GCNPat<
(name i32:$vdata_in, v4i32:$rsrc, i32:$vindex,
- (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
- imm:$slc),
+ i32:$voffset, i32:$soffset, imm:$offset,
+ imm:$cachepolicy, imm),
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN_RTN)
$vdata_in,
(REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
- $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc))
+ $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy))
>;
}
@@ -1205,49 +1252,49 @@ defm : BufferAtomicPatterns<SIbuffer_atomic_xor, "BUFFER_ATOMIC_XOR">;
def : GCNPat<
(SIbuffer_atomic_cmpswap
i32:$data, i32:$cmp, v4i32:$rsrc, 0,
- (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
- imm:$slc),
+ 0, i32:$soffset, imm:$offset,
+ imm:$cachepolicy, 0),
(EXTRACT_SUBREG
(BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN
(REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
- $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)),
+ $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)),
sub0)
>;
def : GCNPat<
(SIbuffer_atomic_cmpswap
i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex,
- (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
- imm:$slc),
+ 0, i32:$soffset, imm:$offset,
+ imm:$cachepolicy, imm),
(EXTRACT_SUBREG
(BUFFER_ATOMIC_CMPSWAP_IDXEN_RTN
(REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
- $vindex, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)),
+ $vindex, $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)),
sub0)
>;
def : GCNPat<
(SIbuffer_atomic_cmpswap
i32:$data, i32:$cmp, v4i32:$rsrc, 0,
- (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
- imm:$slc),
+ i32:$voffset, i32:$soffset, imm:$offset,
+ imm:$cachepolicy, 0),
(EXTRACT_SUBREG
(BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN
(REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
- $voffset, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)),
+ $voffset, $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)),
sub0)
>;
def : GCNPat<
(SIbuffer_atomic_cmpswap
i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex,
- (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
- imm:$slc),
+ i32:$voffset, i32:$soffset, imm:$offset,
+ imm:$cachepolicy, imm),
(EXTRACT_SUBREG
(BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN
(REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
(REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
- $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)),
+ $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)),
sub0)
>;
@@ -1397,54 +1444,6 @@ defm : MUBUFScratchLoadPat_Lo16<BUFFER_LOAD_SHORT_D16_OFFEN, BUFFER_LOAD_SHORT_D
defm : MUBUFScratchLoadPat_Lo16<BUFFER_LOAD_UBYTE_D16_OFFEN, BUFFER_LOAD_UBYTE_D16_OFFSET, i16, az_extloadi8_private>;
defm : MUBUFScratchLoadPat_Lo16<BUFFER_LOAD_SBYTE_D16_OFFEN, BUFFER_LOAD_SBYTE_D16_OFFSET, i16, sextloadi8_private>;
}
-
-// BUFFER_LOAD_DWORD*, addr64=0
-multiclass MUBUF_Load_Dword <ValueType vt,
- MUBUF_Pseudo offset,
- MUBUF_Pseudo offen,
- MUBUF_Pseudo idxen,
- MUBUF_Pseudo bothen> {
-
- def : GCNPat <
- (vt (int_SI_buffer_load_dword v4i32:$rsrc, (i32 imm), i32:$soffset,
- imm:$offset, 0, 0, imm:$glc, imm:$slc,
- imm:$tfe)),
- (offset $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc),
- (as_i1imm $slc), (as_i1imm $tfe))
- >;
-
- def : GCNPat <
- (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset,
- imm:$offset, 1, 0, imm:$glc, imm:$slc,
- imm:$tfe)),
- (offen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc),
- (as_i1imm $tfe))
- >;
-
- def : GCNPat <
- (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset,
- imm:$offset, 0, 1, imm:$glc, imm:$slc,
- imm:$tfe)),
- (idxen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc),
- (as_i1imm $slc), (as_i1imm $tfe))
- >;
-
- def : GCNPat <
- (vt (int_SI_buffer_load_dword v4i32:$rsrc, v2i32:$vaddr, i32:$soffset,
- imm:$offset, 1, 1, imm:$glc, imm:$slc,
- imm:$tfe)),
- (bothen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc),
- (as_i1imm $tfe))
- >;
-}
-
-defm : MUBUF_Load_Dword <i32, BUFFER_LOAD_DWORD_OFFSET, BUFFER_LOAD_DWORD_OFFEN,
- BUFFER_LOAD_DWORD_IDXEN, BUFFER_LOAD_DWORD_BOTHEN>;
-defm : MUBUF_Load_Dword <v2i32, BUFFER_LOAD_DWORDX2_OFFSET, BUFFER_LOAD_DWORDX2_OFFEN,
- BUFFER_LOAD_DWORDX2_IDXEN, BUFFER_LOAD_DWORDX2_BOTHEN>;
-defm : MUBUF_Load_Dword <v4i32, BUFFER_LOAD_DWORDX4_OFFSET, BUFFER_LOAD_DWORDX4_OFFEN,
- BUFFER_LOAD_DWORDX4_IDXEN, BUFFER_LOAD_DWORDX4_BOTHEN>;
-
multiclass MUBUFStore_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Instr_OFFSET,
ValueType vt, PatFrag atomic_st> {
// Store follows atomic op convention so address is first
@@ -1524,32 +1523,36 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
string opcode> {
def : GCNPat<
(vt (name v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset,
- imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)),
+ imm:$format, imm:$cachepolicy, 0)),
(!cast<MTBUF_Pseudo>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset),
- (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0)
+ (as_i8imm $format),
+ (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
>;
def : GCNPat<
(vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset,
- imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)),
+ imm:$format, imm:$cachepolicy, imm)),
(!cast<MTBUF_Pseudo>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset),
- (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0)
+ (as_i8imm $format),
+ (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
>;
def : GCNPat<
(vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset,
- imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)),
+ imm:$format, imm:$cachepolicy, 0)),
(!cast<MTBUF_Pseudo>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset),
- (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0)
+ (as_i8imm $format),
+ (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
>;
def : GCNPat<
(vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, imm:$offset,
- imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)),
+ imm:$format, imm:$cachepolicy, imm)),
(!cast<MTBUF_Pseudo>(opcode # _BOTHEN)
(REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
$rsrc, $soffset, (as_i16imm $offset),
- (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0)
+ (as_i8imm $format),
+ (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
>;
}
@@ -1576,39 +1579,36 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
string opcode> {
def : GCNPat<
(name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset,
- imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc),
+ imm:$format, imm:$cachepolicy, 0),
(!cast<MTBUF_Pseudo>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset,
- (as_i16imm $offset), (as_i8imm $dfmt),
- (as_i8imm $nfmt), (as_i1imm $glc),
- (as_i1imm $slc), 0)
+ (as_i16imm $offset), (as_i8imm $format),
+ (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
>;
def : GCNPat<
(name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset,
- imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc),
+ imm:$format, imm:$cachepolicy, imm),
(!cast<MTBUF_Pseudo>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset,
- (as_i16imm $offset), (as_i8imm $dfmt),
- (as_i8imm $nfmt), (as_i1imm $glc),
- (as_i1imm $slc), 0)
+ (as_i16imm $offset), (as_i8imm $format),
+ (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
>;
def : GCNPat<
(name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset,
- imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc),
+ imm:$format, imm:$cachepolicy, 0),
(!cast<MTBUF_Pseudo>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset,
- (as_i16imm $offset), (as_i8imm $dfmt),
- (as_i8imm $nfmt), (as_i1imm $glc),
- (as_i1imm $slc), 0)
+ (as_i16imm $offset), (as_i8imm $format),
+ (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
>;
def : GCNPat<
(name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset,
- imm:$offset, imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc),
+ imm:$offset, imm:$format, imm:$cachepolicy, imm),
(!cast<MTBUF_Pseudo>(opcode # _BOTHEN_exact)
$vdata,
(REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
- $rsrc, $soffset, (as_i16imm $offset),
- (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0)
+ $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format),
+ (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
>;
}
@@ -1781,8 +1781,8 @@ class MTBUF_Real_si <bits<3> op, MTBUF_Pseudo ps> :
let Inst{14} = !if(ps.has_glc, glc, ps.glc_value);
let Inst{15} = ps.addr64;
let Inst{18-16} = op;
- let Inst{22-19} = !if(ps.has_dfmt, dfmt, ps.dfmt_value);
- let Inst{25-23} = !if(ps.has_nfmt, nfmt, ps.nfmt_value);
+ let Inst{22-19} = dfmt;
+ let Inst{25-23} = nfmt;
let Inst{31-26} = 0x3a; //encoding
let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
@@ -1811,6 +1811,7 @@ defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_si <7>;
//===----------------------------------------------------------------------===//
// CI
+// MTBUF - GFX6, GFX7.
//===----------------------------------------------------------------------===//
class MUBUF_Real_ci <bits<7> op, MUBUF_Pseudo ps> :
@@ -2013,8 +2014,8 @@ class MTBUF_Real_vi <bits<4> op, MTBUF_Pseudo ps> :
let Inst{13} = ps.idxen;
let Inst{14} = !if(ps.has_glc, glc, ps.glc_value);
let Inst{18-15} = op;
- let Inst{22-19} = !if(ps.has_dfmt, dfmt, ps.dfmt_value);
- let Inst{25-23} = !if(ps.has_nfmt, nfmt, ps.nfmt_value);
+ let Inst{22-19} = dfmt;
+ let Inst{25-23} = nfmt;
let Inst{31-26} = 0x3a; //encoding
let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
@@ -2043,8 +2044,8 @@ class MTBUF_Real_gfx80 <bits<4> op, MTBUF_Pseudo ps> :
let Inst{13} = ps.idxen;
let Inst{14} = !if(ps.has_glc, glc, ps.glc_value);
let Inst{18-15} = op;
- let Inst{22-19} = !if(ps.has_dfmt, dfmt, ps.dfmt_value);
- let Inst{25-23} = !if(ps.has_nfmt, nfmt, ps.nfmt_value);
+ let Inst{22-19} = dfmt;
+ let Inst{25-23} = nfmt;
let Inst{31-26} = 0x3a; //encoding
let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
@@ -2089,3 +2090,22 @@ let SubtargetPredicate = HasPackedD16VMem in {
defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Real_AllAddr_vi <0x0e>;
defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Real_AllAddr_vi <0x0f>;
} // End HasPackedD16VMem.
+
+// Searchable table built from every record that derives from MUBUF_Pseudo.
+// Each row carries the fields listed in Fields and is exposed to C++ through
+// a struct named MUBUFInfo. Opcode is the primary key, and the primary lookup
+// function is named getMUBUFOpcodeHelper.
+def MUBUFInfoTable : GenericTable {
+ let FilterClass = "MUBUF_Pseudo";
+ let CppTypeName = "MUBUFInfo";
+ let Fields = ["Opcode", "BaseOpcode", "dwords", "has_vaddr", "has_srsrc", "has_soffset"];
+
+ let PrimaryKey = ["Opcode"];
+ let PrimaryKeyName = "getMUBUFOpcodeHelper";
+}
+
+// Additional lookup into MUBUFInfoTable keyed on Opcode alone.
+def getMUBUFInfoFromOpcode : SearchIndex {
+ let Table = MUBUFInfoTable;
+ let Key = ["Opcode"];
+}
+
+// Additional lookup into MUBUFInfoTable keyed on the (BaseOpcode, dwords)
+// pair.
+// NOTE(review): presumably used to find the variant of an instruction with a
+// different dword count; the C++ consumer is not visible here - confirm.
+def getMUBUFInfoFromBaseOpcodeAndDwords : SearchIndex {
+ let Table = MUBUFInfoTable;
+ let Key = ["BaseOpcode", "dwords"];
+}
diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt
index 174b2df15300a..393311791ec93 100644
--- a/lib/Target/AMDGPU/CMakeLists.txt
+++ b/lib/Target/AMDGPU/CMakeLists.txt
@@ -37,8 +37,10 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUAnnotateUniformValues.cpp
AMDGPUArgumentUsageInfo.cpp
AMDGPUAsmPrinter.cpp
+ AMDGPUAtomicOptimizer.cpp
AMDGPUCallLowering.cpp
AMDGPUCodeGenPrepare.cpp
+ AMDGPUFixFunctionBitcasts.cpp
AMDGPUFrameLowering.cpp
AMDGPUHSAMetadataStreamer.cpp
AMDGPUInstrInfo.cpp
@@ -91,9 +93,11 @@ add_llvm_target(AMDGPUCodeGen
R600OptimizeVectorRegisters.cpp
R600Packetizer.cpp
R600RegisterInfo.cpp
+ SIAddIMGInit.cpp
SIAnnotateControlFlow.cpp
SIDebuggerInsertNops.cpp
SIFixSGPRCopies.cpp
+ SIFixupVectorISel.cpp
SIFixVGPRCopies.cpp
SIFixWWMLiveness.cpp
SIFoldOperands.cpp
@@ -116,6 +120,8 @@ add_llvm_target(AMDGPUCodeGen
SIShrinkInstructions.cpp
SIWholeQuadMode.cpp
GCNILPSched.cpp
+ GCNDPPCombine.cpp
+ SIModeRegister.cpp
)
add_subdirectory(AsmParser)
diff --git a/lib/Target/AMDGPU/DSInstructions.td b/lib/Target/AMDGPU/DSInstructions.td
index cdc6ab9412e61..31d2ebef481d2 100644
--- a/lib/Target/AMDGPU/DSInstructions.td
+++ b/lib/Target/AMDGPU/DSInstructions.td
@@ -728,7 +728,9 @@ class DS64Bit4ByteAlignedWritePat<DS_Pseudo inst, PatFrag frag> : GCNPat<
(i1 0))
>;
-let OtherPredicates = [LDSRequiresM0Init] in {
+// v2i32 loads are split into i32 loads on SI during lowering, due to a bug
+// related to bounds checking.
+let OtherPredicates = [LDSRequiresM0Init, isCIVI] in {
def : DS64Bit4ByteAlignedReadPat<DS_READ2_B32, load_local_m0>;
def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32, store_local_m0>;
}
diff --git a/lib/Target/AMDGPU/FLATInstructions.td b/lib/Target/AMDGPU/FLATInstructions.td
index 3ef473b7fd966..44040d352e6a8 100644
--- a/lib/Target/AMDGPU/FLATInstructions.td
+++ b/lib/Target/AMDGPU/FLATInstructions.td
@@ -121,6 +121,11 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
let Inst{63-56} = !if(ps.has_vdst, vdst, ?);
}
+// Mixin for FLAT pseudos. IsSaddr records whether the instruction is the
+// saddr form; SaddrOp is a name string that the saddr and non-saddr variants
+// of the same operation are both instantiated with.
+// NOTE(review): presumably consumed by an instruction-mapping table that
+// pairs the two variants; that consumer is not visible here - confirm.
+class GlobalSaddrTable <bit is_saddr, string Name = ""> {
+ bit IsSaddr = is_saddr;
+ string SaddrOp = Name;
+}
+
// TODO: Is exec allowed for saddr? The disabled value 0x7f is the
// same encoding value as exec_hi, so it isn't possible to use that if
// saddr is 32-bit (which isn't handled here yet).
@@ -171,15 +176,19 @@ class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass,
multiclass FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedInput = 0> {
let is_flat_global = 1 in {
- def "" : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1>;
- def _SADDR : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1, 1>;
+ def "" : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1>,
+ GlobalSaddrTable<0, opName>;
+ def _SADDR : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1, 1>,
+ GlobalSaddrTable<1, opName>;
}
}
multiclass FLAT_Global_Store_Pseudo<string opName, RegisterClass regClass> {
let is_flat_global = 1 in {
- def "" : FLAT_Store_Pseudo<opName, regClass, 1, 1>;
- def _SADDR : FLAT_Store_Pseudo<opName, regClass, 1, 1, 1>;
+ def "" : FLAT_Store_Pseudo<opName, regClass, 1, 1>,
+ GlobalSaddrTable<0, opName>;
+ def _SADDR : FLAT_Store_Pseudo<opName, regClass, 1, 1, 1>,
+ GlobalSaddrTable<1, opName>;
}
}
@@ -262,6 +271,7 @@ multiclass FLAT_Atomic_Pseudo<
(outs),
(ins VReg_64:$vaddr, data_rc:$vdata, offset_u12:$offset, SLC:$slc),
" $vaddr, $vdata$offset$slc">,
+ GlobalSaddrTable<0, opName>,
AtomicNoRet <opName, 0> {
let PseudoInstr = NAME;
}
@@ -272,10 +282,11 @@ multiclass FLAT_Atomic_Pseudo<
" $vdst, $vaddr, $vdata$offset glc$slc",
[(set vt:$vdst,
(atomic (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>,
+ GlobalSaddrTable<0, opName#"_rtn">,
AtomicNoRet <opName, 1>;
}
-multiclass FLAT_Global_Atomic_Pseudo<
+multiclass FLAT_Global_Atomic_Pseudo_NO_RTN<
string opName,
RegisterClass vdst_rc,
ValueType vt,
@@ -287,35 +298,48 @@ multiclass FLAT_Global_Atomic_Pseudo<
(outs),
(ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, SLC:$slc),
" $vaddr, $vdata, off$offset$slc">,
+ GlobalSaddrTable<0, opName>,
AtomicNoRet <opName, 0> {
let has_saddr = 1;
let PseudoInstr = NAME;
}
- def _RTN : FLAT_AtomicRet_Pseudo <opName,
- (outs vdst_rc:$vdst),
- (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, SLC:$slc),
- " $vdst, $vaddr, $vdata, off$offset glc$slc",
- [(set vt:$vdst,
- (atomic (FLATSignedAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>,
- AtomicNoRet <opName, 1> {
- let has_saddr = 1;
- }
-
def _SADDR : FLAT_AtomicNoRet_Pseudo <opName,
(outs),
(ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, SLC:$slc),
" $vaddr, $vdata, $saddr$offset$slc">,
+ GlobalSaddrTable<1, opName>,
AtomicNoRet <opName#"_saddr", 0> {
let has_saddr = 1;
let enabled_saddr = 1;
let PseudoInstr = NAME#"_SADDR";
}
+}
+
+multiclass FLAT_Global_Atomic_Pseudo_RTN<
+ string opName,
+ RegisterClass vdst_rc,
+ ValueType vt,
+ SDPatternOperator atomic = null_frag,
+ ValueType data_vt = vt,
+ RegisterClass data_rc = vdst_rc> {
+
+ def _RTN : FLAT_AtomicRet_Pseudo <opName,
+ (outs vdst_rc:$vdst),
+ (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, SLC:$slc),
+ " $vdst, $vaddr, $vdata, off$offset glc$slc",
+ [(set vt:$vdst,
+ (atomic (FLATSignedAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>,
+ GlobalSaddrTable<0, opName#"_rtn">,
+ AtomicNoRet <opName, 1> {
+ let has_saddr = 1;
+ }
def _SADDR_RTN : FLAT_AtomicRet_Pseudo <opName,
(outs vdst_rc:$vdst),
(ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, SLC:$slc),
" $vdst, $vaddr, $vdata, $saddr$offset glc$slc">,
+ GlobalSaddrTable<1, opName#"_rtn">,
AtomicNoRet <opName#"_saddr", 1> {
let has_saddr = 1;
let enabled_saddr = 1;
@@ -323,10 +347,20 @@ multiclass FLAT_Global_Atomic_Pseudo<
}
}
+// Convenience multiclass: instantiates both FLAT_Global_Atomic_Pseudo_NO_RTN
+// and FLAT_Global_Atomic_Pseudo_RTN, forwarding the same arguments to each.
+multiclass FLAT_Global_Atomic_Pseudo<
+ string opName,
+ RegisterClass vdst_rc,
+ ValueType vt,
+ SDPatternOperator atomic = null_frag,
+ ValueType data_vt = vt,
+ RegisterClass data_rc = vdst_rc> :
+ FLAT_Global_Atomic_Pseudo_NO_RTN<opName, vdst_rc, vt, atomic, data_vt, data_rc>,
+ FLAT_Global_Atomic_Pseudo_RTN<opName, vdst_rc, vt, atomic, data_vt, data_rc>;
+
class flat_binary_atomic_op<SDNode atomic_op> : PatFrag<
(ops node:$ptr, node:$value),
(atomic_op node:$ptr, node:$value),
- [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.FLAT_ADDRESS;}]
+ [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS;}]
>;
def atomic_cmp_swap_flat : flat_binary_atomic_op<AMDGPUatomic_cmp_swap>;
diff --git a/lib/Target/AMDGPU/GCNDPPCombine.cpp b/lib/Target/AMDGPU/GCNDPPCombine.cpp
new file mode 100644
index 0000000000000..56071d0d23744
--- /dev/null
+++ b/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -0,0 +1,446 @@
+//=======- GCNDPPCombine.cpp - optimization for DPP instructions ---==========//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// The pass combines a V_MOV_B32_dpp instruction with its VALU uses as a DPP
+// src0 operand. If any of the use instructions cannot be combined with the
+// mov, the whole sequence is reverted.
+//
+// $old = ...
+// $dpp_value = V_MOV_B32_dpp $old, $vgpr_to_be_read_from_other_lane,
+// dpp_controls..., $bound_ctrl
+// $res = VALU $dpp_value, ...
+//
+// to
+//
+// $res = VALU_DPP $folded_old, $vgpr_to_be_read_from_other_lane, ...,
+// dpp_controls..., $folded_bound_ctrl
+//
+// Combining rules :
+//
+// $bound_ctrl is DPP_BOUND_ZERO, $old is any
+// $bound_ctrl is DPP_BOUND_OFF, $old is 0
+//
+// ->$folded_old = undef, $folded_bound_ctrl = DPP_BOUND_ZERO
+// $bound_ctrl is DPP_BOUND_OFF, $old is undef
+//
+// ->$folded_old = undef, $folded_bound_ctrl = DPP_BOUND_OFF
+// $bound_ctrl is DPP_BOUND_OFF, $old is foldable
+//
+// ->$folded_old = folded value, $folded_bound_ctrl = DPP_BOUND_OFF
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/Pass.h"
+#include <cassert>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "gcn-dpp-combine"
+
+STATISTIC(NumDPPMovsCombined, "Number of DPP moves combined.");
+
+namespace {
+
+/// Machine function pass that folds V_MOV_B32_dpp instructions into the VALU
+/// instructions that use their result (the combining rules are listed in the
+/// file header).
+class GCNDPPCombine : public MachineFunctionPass {
+  // Both pointers are assigned by runOnMachineFunction before any helper is
+  // called; they start out null so that a stray early use fails loudly
+  // instead of reading an indeterminate pointer.
+  MachineRegisterInfo *MRI = nullptr;
+  const SIInstrInfo *TII = nullptr;
+
+  using RegSubRegPair = TargetInstrInfo::RegSubRegPair;
+
+  /// Classify the definition of the mov's $old operand: returns the
+  /// immediate operand that initializes it, nullptr when it is undefined,
+  /// or \p OldOpnd itself otherwise.
+  MachineOperand *getOldOpndValue(MachineOperand &OldOpnd) const;
+
+  /// Pick the register to use as $old of the combined instruction when the
+  /// mov's $old is the immediate \p OldOpndValue; returns an empty pair when
+  /// the immediate cannot be folded for \p OrigMI's opcode.
+  RegSubRegPair foldOldOpnd(MachineInstr &OrigMI,
+                            RegSubRegPair OldOpndVGPR,
+                            MachineOperand &OldOpndValue) const;
+
+  /// Resolve $old (folding \p OldOpndValue if required) and then build the
+  /// DPP instruction; returns nullptr on failure.
+  MachineInstr *createDPPInst(MachineInstr &OrigMI,
+                              MachineInstr &MovMI,
+                              RegSubRegPair OldOpndVGPR,
+                              MachineOperand *OldOpndValue,
+                              bool BoundCtrlZero) const;
+
+  /// Build the DPP form of \p OrigMI in front of it, taking src0 and the DPP
+  /// controls from \p MovMI; returns nullptr on failure.
+  MachineInstr *createDPPInst(MachineInstr &OrigMI,
+                              MachineInstr &MovMI,
+                              RegSubRegPair OldOpndVGPR,
+                              bool BoundCtrlZero) const;
+
+  /// True when \p MI has no operand named \p OpndName, or when that
+  /// immediate operand masked with \p Mask equals \p Value.
+  bool hasNoImmOrEqual(MachineInstr &MI,
+                       unsigned OpndName,
+                       int64_t Value,
+                       int64_t Mask = -1) const;
+
+  /// Try to combine one V_MOV_B32_dpp with all of its uses; returns true
+  /// when the mov and its users were replaced.
+  bool combineDPPMov(MachineInstr &MovMI) const;
+
+public:
+  static char ID;
+
+  GCNDPPCombine() : MachineFunctionPass(ID) {
+    initializeGCNDPPCombinePass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override { return "GCN DPP Combine"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    // The pass only replaces and erases instructions; it declares the CFG
+    // preserved.
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // end anonymous namespace
+
+// Register the pass under the DEBUG_TYPE argument ("gcn-dpp-combine") with
+// the description "GCN DPP Combine".
+INITIALIZE_PASS(GCNDPPCombine, DEBUG_TYPE, "GCN DPP Combine", false, false)
+
+char GCNDPPCombine::ID = 0;
+
+// Externally visible reference to the pass ID.
+char &llvm::GCNDPPCombineID = GCNDPPCombine::ID;
+
+// Factory: returns a newly allocated instance of the pass.
+FunctionPass *llvm::createGCNDPPCombinePass() {
+ return new GCNDPPCombine();
+}
+
+// Map an opcode to its 32-bit DPP opcode. The opcode is looked up directly
+// first; if that fails, its e32 form (when one exists) is looked up instead.
+// Returns -1 when neither lookup yields a DPP opcode.
+static int getDPPOp(unsigned Op) {
+  const int Direct = AMDGPU::getDPPOp32(Op);
+  if (Direct != -1)
+    return Direct;
+
+  const int AsE32 = AMDGPU::getVOPe32(Op);
+  if (AsE32 == -1)
+    return -1;
+  return AMDGPU::getDPPOp32(AsE32);
+}
+
+// Looks up the instruction defining the register operand OldOpnd and
+// returns:
+// 1. nullptr if no defining instruction is found or it is an IMPLICIT_DEF
+//    (the operand is treated as undef);
+// 2. the immediate source operand if the definition is a COPY or
+//    V_MOV_B32_e32 whose operand 1 is an immediate;
+// 3. OldOpnd itself in every other case.
+MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const {
+  auto *DefMI = getVRegSubRegDef(getRegSubRegPair(OldOpnd), *MRI);
+  if (!DefMI)
+    return nullptr;
+
+  const unsigned DefOpc = DefMI->getOpcode();
+  if (DefOpc == AMDGPU::IMPLICIT_DEF)
+    return nullptr;
+
+  if (DefOpc == AMDGPU::COPY || DefOpc == AMDGPU::V_MOV_B32_e32) {
+    auto &Src = DefMI->getOperand(1);
+    if (Src.isImm())
+      return &Src;
+  }
+  return &OldOpnd;
+}
+
+// Builds the DPP form of OrigMI immediately before OrigMI, reusing OrigMI's
+// debug location. src0 and the dpp_ctrl/row_mask/bank_mask controls are taken
+// from MovMI; vdst, the source modifiers, src1 and src2 are taken from OrigMI.
+// OldOpndVGPR supplies the $old operand when the DPP opcode has one, and
+// BoundCtrlZero becomes the final bound_ctrl immediate.
+//
+// Returns the new instruction, or nullptr when OrigMI has no DPP opcode or
+// when one of the sources is not legal in the DPP instruction; in the latter
+// case the partially built instruction is erased again.
+MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
+ MachineInstr &MovMI,
+ RegSubRegPair OldOpndVGPR,
+ bool BoundCtrlZero) const {
+ assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);
+ assert(TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg() ==
+ TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)->getReg());
+
+ auto OrigOp = OrigMI.getOpcode();
+ auto DPPOp = getDPPOp(OrigOp);
+ if (DPPOp == -1) {
+ LLVM_DEBUG(dbgs() << " failed: no DPP opcode\n");
+ return nullptr;
+ }
+
+ auto DPPInst = BuildMI(*OrigMI.getParent(), OrigMI,
+ OrigMI.getDebugLoc(), TII->get(DPPOp));
+ bool Fail = false;
+ // Single-pass loop: 'break' leaves operand construction early and falls
+ // through to the shared failure cleanup below.
+ do {
+ auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst);
+ assert(Dst);
+ DPPInst.add(*Dst);
+ // Index of the next operand to be appended; used to check the expected
+ // operand layout and for the operand legality queries.
+ int NumOperands = 1;
+
+ const int OldIdx = AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::old);
+ if (OldIdx != -1) {
+ assert(OldIdx == NumOperands);
+ assert(isOfRegClass(OldOpndVGPR, AMDGPU::VGPR_32RegClass, *MRI));
+ DPPInst.addReg(OldOpndVGPR.Reg, 0, OldOpndVGPR.SubReg);
+ ++NumOperands;
+ }
+
+ if (auto *Mod0 = TII->getNamedOperand(OrigMI,
+ AMDGPU::OpName::src0_modifiers)) {
+ assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
+ AMDGPU::OpName::src0_modifiers));
+ // Only the abs and neg modifier bits are expected here.
+ assert(0LL == (Mod0->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)));
+ DPPInst.addImm(Mod0->getImm());
+ ++NumOperands;
+ }
+ // src0 of the combined instruction is the source of the mov, not OrigMI's
+ // own src0 (which is the mov's result).
+ auto *Src0 = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
+ assert(Src0);
+ if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src0)) {
+ LLVM_DEBUG(dbgs() << " failed: src0 is illegal\n");
+ Fail = true;
+ break;
+ }
+ DPPInst.add(*Src0);
+ ++NumOperands;
+
+ if (auto *Mod1 = TII->getNamedOperand(OrigMI,
+ AMDGPU::OpName::src1_modifiers)) {
+ assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
+ AMDGPU::OpName::src1_modifiers));
+ assert(0LL == (Mod1->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)));
+ DPPInst.addImm(Mod1->getImm());
+ ++NumOperands;
+ }
+ if (auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) {
+ if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src1)) {
+ LLVM_DEBUG(dbgs() << " failed: src1 is illegal\n");
+ Fail = true;
+ break;
+ }
+ DPPInst.add(*Src1);
+ ++NumOperands;
+ }
+
+ if (auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2)) {
+ if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src2)) {
+ LLVM_DEBUG(dbgs() << " failed: src2 is illegal\n");
+ Fail = true;
+ break;
+ }
+ // NumOperands is not read again after this point, so it is not advanced.
+ DPPInst.add(*Src2);
+ }
+
+ // The DPP controls are copied from the mov; bound_ctrl is set from the
+ // caller's decision.
+ DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl));
+ DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask));
+ DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask));
+ DPPInst.addImm(BoundCtrlZero ? 1 : 0);
+ } while (false);
+
+ if (Fail) {
+ // Remove the half-built instruction so that the block is left unchanged.
+ DPPInst.getInstr()->eraseFromParent();
+ return nullptr;
+ }
+ LLVM_DEBUG(dbgs() << " combined: " << *DPPInst.getInstr());
+ return DPPInst.getInstr();
+}
+
+/// Chooses the $old register of the combined DPP instruction when the mov's
+/// $old operand is known to hold the immediate \p OldOpndValue.
+///
+/// Two kinds of immediates can be folded:
+///  - V_MAX_U32 / V_MAX_I32 / V_MIN_I32 with the value that the operation
+///    always returns regardless of the other source (UINT32_MAX, INT32_MAX
+///    and INT32_MIN respectively): \p OldOpndVGPR is kept as $old.
+///  - V_MUL_I32_I24 / V_MUL_U32_U24 with the value 1: the result equals
+///    src1, so src1 of \p OrigMI is used as $old.
+///
+/// \returns the register pair to use as $old, or an empty RegSubRegPair when
+/// the immediate cannot be folded for this opcode.
+GCNDPPCombine::RegSubRegPair
+GCNDPPCombine::foldOldOpnd(MachineInstr &OrigMI,
+                           RegSubRegPair OldOpndVGPR,
+                           MachineOperand &OldOpndValue) const {
+  assert(OldOpndValue.isImm());
+  // The operations are 32 bits wide, but the operand stores its immediate in
+  // an int64_t which may hold either the sign- or the zero-extended form of
+  // the same 32-bit pattern (e.g. -1 or 0xFFFFFFFF). Compare on the truncated
+  // 32-bit value so that both encodings are recognised.
+  const int64_t Imm = OldOpndValue.getImm();
+  switch (OrigMI.getOpcode()) {
+  default: break;
+  case AMDGPU::V_MAX_U32_e32:
+    if (static_cast<uint32_t>(Imm) == std::numeric_limits<uint32_t>::max())
+      return OldOpndVGPR;
+    break;
+  case AMDGPU::V_MAX_I32_e32:
+    if (static_cast<int32_t>(Imm) == std::numeric_limits<int32_t>::max())
+      return OldOpndVGPR;
+    break;
+  case AMDGPU::V_MIN_I32_e32:
+    if (static_cast<int32_t>(Imm) == std::numeric_limits<int32_t>::min())
+      return OldOpndVGPR;
+    break;
+
+  case AMDGPU::V_MUL_I32_I24_e32:
+  case AMDGPU::V_MUL_U32_U24_e32:
+    if (Imm == 1) {
+      auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
+      assert(Src1 && Src1->isReg());
+      return getRegSubRegPair(*Src1);
+    }
+    break;
+  }
+  // Not foldable: an empty pair (Reg == 0) signals failure to the caller.
+  return RegSubRegPair();
+}
+
+// Resolves the $old operand and then builds the DPP instruction.
+//
+// Combinations handled (the first two are normalised by the caller):
+//   $bound_ctrl is DPP_BOUND_ZERO, $old is any
+//   $bound_ctrl is DPP_BOUND_OFF, $old is 0
+//     -> $old = undef, $bound_ctrl = DPP_BOUND_ZERO
+//
+//   $bound_ctrl is DPP_BOUND_OFF, $old is undef
+//     -> $old = undef, $bound_ctrl = DPP_BOUND_OFF
+//
+//   $bound_ctrl is DPP_BOUND_OFF, $old is foldable
+//     -> $old = folded value, $bound_ctrl = DPP_BOUND_OFF
+//
+// Returns nullptr when a required fold of the $old immediate is not possible
+// or when the DPP instruction itself cannot be built.
+MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
+                                           MachineInstr &MovMI,
+                                           RegSubRegPair OldOpndVGPR,
+                                           MachineOperand *OldOpndValue,
+                                           bool BoundCtrlZero) const {
+  assert(OldOpndVGPR.Reg);
+
+  // A fold is only needed when $old has a known value and bound_ctrl is off.
+  const bool MustFoldOld = OldOpndValue && !BoundCtrlZero;
+  if (!MustFoldOld)
+    return createDPPInst(OrigMI, MovMI, OldOpndVGPR, BoundCtrlZero);
+
+  assert(OldOpndValue->isImm());
+  const RegSubRegPair FoldedOld =
+      foldOldOpnd(OrigMI, OldOpndVGPR, *OldOpndValue);
+  if (FoldedOld.Reg)
+    return createDPPInst(OrigMI, MovMI, FoldedOld, BoundCtrlZero);
+
+  LLVM_DEBUG(dbgs() << " failed: old immediate cannot be folded\n");
+  return nullptr;
+}
+
+// Returns true when MI has no operand named OpndName, or when that operand
+// (which must be an immediate) equals Value after being masked with Mask.
+bool GCNDPPCombine::hasNoImmOrEqual(MachineInstr &MI, unsigned OpndName,
+                                    int64_t Value, int64_t Mask) const {
+  if (const auto *Opnd = TII->getNamedOperand(MI, OpndName)) {
+    assert(Opnd->isImm());
+    return Value == (Opnd->getImm() & Mask);
+  }
+  // The operand is absent: nothing to compare against.
+  return true;
+}
+
+// Tries to fold the V_MOV_B32_dpp instruction MovMI into every non-debug use
+// of its result. On success MovMI and all of its users are erased, the newly
+// built DPP instructions take their place, and true is returned. If any use
+// cannot be combined, everything created so far is erased again and false is
+// returned.
+bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
+ assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);
+ auto *BCZOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bound_ctrl);
+ assert(BCZOpnd && BCZOpnd->isImm());
+ bool BoundCtrlZero = 0 != BCZOpnd->getImm();
+
+ LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI);
+
+ auto *OldOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::old);
+ assert(OldOpnd && OldOpnd->isReg());
+ auto OldOpndVGPR = getRegSubRegPair(*OldOpnd);
+ // nullptr: $old is undef; an immediate operand: $old has that known value;
+ // OldOpnd itself: $old is defined in some other way.
+ auto *OldOpndValue = getOldOpndValue(*OldOpnd);
+ assert(!OldOpndValue || OldOpndValue->isImm() || OldOpndValue == OldOpnd);
+ if (OldOpndValue) {
+ if (BoundCtrlZero) {
+ OldOpndVGPR.Reg = AMDGPU::NoRegister; // should be undef, ignore old opnd
+ OldOpndValue = nullptr;
+ } else {
+ if (!OldOpndValue->isImm()) {
+ LLVM_DEBUG(dbgs() << " failed: old operand isn't an imm or undef\n");
+ return false;
+ }
+ // $old == 0 with bound_ctrl off is rewritten as undef $old with
+ // bound_ctrl zero.
+ if (OldOpndValue->getImm() == 0) {
+ OldOpndVGPR.Reg = AMDGPU::NoRegister; // should be undef
+ OldOpndValue = nullptr;
+ BoundCtrlZero = true;
+ }
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << " old=";
+ if (!OldOpndValue)
+ dbgs() << "undef";
+ else
+ dbgs() << OldOpndValue->getImm();
+ dbgs() << ", bound_ctrl=" << BoundCtrlZero << '\n');
+
+ // OrigMIs: instructions to erase on success (the mov and its users).
+ // DPPMIs: instructions created here, to erase on rollback.
+ std::vector<MachineInstr*> OrigMIs, DPPMIs;
+ if (!OldOpndVGPR.Reg) { // OldOpndVGPR = undef
+ // Materialize the undef $old as a fresh VGPR defined by IMPLICIT_DEF,
+ // placed before the mov.
+ OldOpndVGPR = RegSubRegPair(
+ MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass));
+ auto UndefInst = BuildMI(*MovMI.getParent(), MovMI, MovMI.getDebugLoc(),
+ TII->get(AMDGPU::IMPLICIT_DEF), OldOpndVGPR.Reg);
+ DPPMIs.push_back(UndefInst.getInstr());
+ }
+
+ OrigMIs.push_back(&MovMI);
+ // Starts out true so that a mov without any non-debug use is reported as
+ // not combined.
+ bool Rollback = true;
+ for (auto &Use : MRI->use_nodbg_operands(
+ TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg())) {
+ // Each use must prove that it can be combined; any 'break' below leaves
+ // Rollback set.
+ Rollback = true;
+
+ auto &OrigMI = *Use.getParent();
+ auto OrigOp = OrigMI.getOpcode();
+ if (TII->isVOP3(OrigOp)) {
+ if (!TII->hasVALU32BitEncoding(OrigOp)) {
+ LLVM_DEBUG(dbgs() << " failed: VOP3 hasn't e32 equivalent\n");
+ break;
+ }
+ // check if other than abs|neg modifiers are set (opsel for example)
+ const int64_t Mask = ~(SISrcMods::ABS | SISrcMods::NEG);
+ if (!hasNoImmOrEqual(OrigMI, AMDGPU::OpName::src0_modifiers, 0, Mask) ||
+ !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::src1_modifiers, 0, Mask) ||
+ !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::clamp, 0) ||
+ !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::omod, 0)) {
+ LLVM_DEBUG(dbgs() << " failed: VOP3 has non-default modifiers\n");
+ break;
+ }
+ } else if (!TII->isVOP1(OrigOp) && !TII->isVOP2(OrigOp)) {
+ LLVM_DEBUG(dbgs() << " failed: not VOP1/2/3\n");
+ break;
+ }
+
+ LLVM_DEBUG(dbgs() << " combining: " << OrigMI);
+ if (&Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)) {
+ // The mov result is already src0: combine directly.
+ if (auto *DPPInst = createDPPInst(OrigMI, MovMI, OldOpndVGPR,
+ OldOpndValue, BoundCtrlZero)) {
+ DPPMIs.push_back(DPPInst);
+ Rollback = false;
+ }
+ } else if (OrigMI.isCommutable() &&
+ &Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) {
+ // The mov result is src1: commute a temporary clone of the user and
+ // build the DPP instruction from the clone. The clone is always erased
+ // afterwards, whether or not the combine succeeded.
+ auto *BB = OrigMI.getParent();
+ auto *NewMI = BB->getParent()->CloneMachineInstr(&OrigMI);
+ BB->insert(OrigMI, NewMI);
+ if (TII->commuteInstruction(*NewMI)) {
+ LLVM_DEBUG(dbgs() << " commuted: " << *NewMI);
+ if (auto *DPPInst = createDPPInst(*NewMI, MovMI, OldOpndVGPR,
+ OldOpndValue, BoundCtrlZero)) {
+ DPPMIs.push_back(DPPInst);
+ Rollback = false;
+ }
+ } else
+ LLVM_DEBUG(dbgs() << " failed: cannot be commuted\n");
+ NewMI->eraseFromParent();
+ } else
+ LLVM_DEBUG(dbgs() << " failed: no suitable operands\n");
+ if (Rollback)
+ break;
+ OrigMIs.push_back(&OrigMI);
+ }
+
+ // Commit or undo: on rollback drop everything that was created, otherwise
+ // drop the mov and the users that were replaced.
+ for (auto *MI : *(Rollback? &DPPMIs : &OrigMIs))
+ MI->eraseFromParent();
+
+ return !Rollback;
+}
+
+// Pass entry point. Does nothing on subtargets without DPP or when the
+// function is skipped; otherwise walks every block bottom-up and tries to
+// combine each V_MOV_B32_dpp it finds. Returns true if anything was combined.
+bool GCNDPPCombine::runOnMachineFunction(MachineFunction &MF) {
+  auto &ST = MF.getSubtarget<GCNSubtarget>();
+  if (!ST.hasDPP())
+    return false;
+  if (skipFunction(MF.getFunction()))
+    return false;
+
+  MRI = &MF.getRegInfo();
+  TII = ST.getInstrInfo();
+
+  assert(MRI->isSSA() && "Must be run on SSA");
+
+  bool Changed = false;
+  for (auto &MBB : MF) {
+    auto It = MBB.rbegin();
+    const auto End = MBB.rend();
+    while (It != End) {
+      // Step the iterator before combining: a successful combine erases MI.
+      auto &MI = *It;
+      ++It;
+      if (MI.getOpcode() != AMDGPU::V_MOV_B32_dpp)
+        continue;
+      if (!combineDPPMov(MI))
+        continue;
+      Changed = true;
+      ++NumDPPMovsCombined;
+    }
+  }
+  return Changed;
+}
diff --git a/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index f236f10ba75ab..c6396de89c4f6 100644
--- a/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -215,6 +215,14 @@ void GCNHazardRecognizer::AdvanceCycle() {
if (!CurrCycleInstr)
return;
+ // Do not track non-instructions which do not affect the wait states.
+ // If included, these instructions can lead to buffer overflow such that
+ // detectable hazards are missed.
+ if (CurrCycleInstr->getOpcode() == AMDGPU::IMPLICIT_DEF)
+ return;
+ else if (CurrCycleInstr->isDebugInstr())
+ return;
+
unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
// Keep track of emitted instructions
@@ -253,8 +261,7 @@ int GCNHazardRecognizer::getWaitStatesSince(
return WaitStates;
unsigned Opcode = MI->getOpcode();
- if (Opcode == AMDGPU::DBG_VALUE || Opcode == AMDGPU::IMPLICIT_DEF ||
- Opcode == AMDGPU::INLINEASM)
+ if (Opcode == AMDGPU::INLINEASM)
continue;
}
++WaitStates;
diff --git a/lib/Target/AMDGPU/GCNILPSched.cpp b/lib/Target/AMDGPU/GCNILPSched.cpp
index 651091d441364..d62dc8d86781c 100644
--- a/lib/Target/AMDGPU/GCNILPSched.cpp
+++ b/lib/Target/AMDGPU/GCNILPSched.cpp
@@ -335,7 +335,7 @@ GCNILPScheduler::schedule(ArrayRef<const SUnit*> BotRoots,
assert(C);
AvailQueue.remove(*C);
auto SU = C->SU;
- LLVM_DEBUG(dbgs() << "Selected "; SU->dump(&DAG));
+ LLVM_DEBUG(dbgs() << "Selected "; DAG.dumpNode(*SU));
advanceToCycle(SU->getHeight());
diff --git a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
index 15366d66bd852..8e4cc391dc21c 100644
--- a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
+++ b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@@ -434,8 +434,7 @@ void GCNIterativeScheduler::scheduleRegion(Region &R, Range &&Schedule,
// Sort recorded regions by pressure - highest at the front
void GCNIterativeScheduler::sortRegionsByPressure(unsigned TargetOcc) {
const auto &ST = MF.getSubtarget<GCNSubtarget>();
- llvm::sort(Regions.begin(), Regions.end(),
- [&ST, TargetOcc](const Region *R1, const Region *R2) {
+ llvm::sort(Regions, [&ST, TargetOcc](const Region *R1, const Region *R2) {
return R2->MaxPressure.less(ST, R1->MaxPressure, TargetOcc);
});
}
diff --git a/lib/Target/AMDGPU/GCNMinRegStrategy.cpp b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
index 192d534bb9cfd..ec6bcae335551 100644
--- a/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
+++ b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
@@ -258,7 +258,7 @@ GCNMinRegScheduler::schedule(ArrayRef<const SUnit*> TopRoots,
assert(C);
RQ.remove(*C);
auto SU = C->SU;
- LLVM_DEBUG(dbgs() << "Selected "; SU->dump(&DAG));
+ LLVM_DEBUG(dbgs() << "Selected "; DAG.dumpNode(*SU));
releaseSuccessors(SU, StepNo);
Schedule.push_back(SU);
diff --git a/lib/Target/AMDGPU/GCNProcessors.td b/lib/Target/AMDGPU/GCNProcessors.td
index d76acfa24f901..b8142a4e4ff88 100644
--- a/lib/Target/AMDGPU/GCNProcessors.td
+++ b/lib/Target/AMDGPU/GCNProcessors.td
@@ -156,3 +156,8 @@ def : ProcessorModel<"gfx904", SIQuarterSpeedModel,
def : ProcessorModel<"gfx906", SIQuarterSpeedModel,
[FeatureISAVersion9_0_6]
>;
+
+def : ProcessorModel<"gfx909", SIQuarterSpeedModel,
+ [FeatureISAVersion9_0_9]
+>;
+
diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
index db908368a1791..fab0f87dfcbea 100644
--- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
+++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
@@ -207,9 +207,12 @@ void AMDGPUInstPrinter::printDA(const MCInst *MI, unsigned OpNo,
printNamedBit(MI, OpNo, O, "da");
}
-void AMDGPUInstPrinter::printR128(const MCInst *MI, unsigned OpNo,
+void AMDGPUInstPrinter::printR128A16(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O) {
- printNamedBit(MI, OpNo, O, "r128");
+ if (STI.hasFeature(AMDGPU::FeatureR128A16))
+ printNamedBit(MI, OpNo, O, "a16");
+ else
+ printNamedBit(MI, OpNo, O, "r128");
}
void AMDGPUInstPrinter::printLWE(const MCInst *MI, unsigned OpNo,
@@ -236,21 +239,12 @@ void AMDGPUInstPrinter::printExpVM(const MCInst *MI, unsigned OpNo,
O << " vm";
}
-void AMDGPUInstPrinter::printDFMT(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
- if (MI->getOperand(OpNo).getImm()) {
- O << " dfmt:";
- printU8ImmDecOperand(MI, OpNo, O);
- }
-}
-
-void AMDGPUInstPrinter::printNFMT(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
- if (MI->getOperand(OpNo).getImm()) {
- O << " nfmt:";
- printU8ImmDecOperand(MI, OpNo, O);
+void AMDGPUInstPrinter::printFORMAT(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ if (unsigned Val = MI->getOperand(OpNo).getImm()) {
+ O << " dfmt:" << (Val & 15);
+ O << ", nfmt:" << (Val >> 4);
}
}
@@ -1161,8 +1155,7 @@ void AMDGPUInstPrinter::printSwizzle(const MCInst *MI, unsigned OpNo,
void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
- AMDGPU::IsaInfo::IsaVersion ISA =
- AMDGPU::IsaInfo::getIsaVersion(STI.getFeatureBits());
+ AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(STI.getCPU());
unsigned SImm16 = MI->getOperand(OpNo).getImm();
unsigned Vmcnt, Expcnt, Lgkmcnt;
diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
index 11a496a38b2cd..0ba74ca0f3e19 100644
--- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
+++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
@@ -80,7 +80,7 @@ private:
raw_ostream &O);
void printDA(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
- void printR128(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ void printR128A16(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
void printLWE(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
@@ -90,10 +90,8 @@ private:
const MCSubtargetInfo &STI, raw_ostream &O);
void printExpVM(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
- void printDFMT(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O);
- void printNFMT(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O);
+ void printFORMAT(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
void printRegOperand(unsigned RegNo, raw_ostream &O);
void printVOPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
diff --git a/lib/Target/AMDGPU/LLVMBuild.txt b/lib/Target/AMDGPU/LLVMBuild.txt
index c54a13c4b4d88..e591d756a545e 100644
--- a/lib/Target/AMDGPU/LLVMBuild.txt
+++ b/lib/Target/AMDGPU/LLVMBuild.txt
@@ -30,5 +30,5 @@ has_disassembler = 1
type = Library
name = AMDGPUCodeGen
parent = AMDGPU
-required_libraries = Analysis AsmPrinter CodeGen Core IPO MC AMDGPUAsmPrinter AMDGPUDesc AMDGPUInfo AMDGPUUtils Scalar SelectionDAG Support Target TransformUtils Vectorize GlobalISel
+required_libraries = Analysis AsmPrinter CodeGen Core IPO MC AMDGPUAsmPrinter AMDGPUDesc AMDGPUInfo AMDGPUUtils Scalar SelectionDAG Support Target TransformUtils Vectorize GlobalISel BinaryFormat
add_to_library_groups = AMDGPU
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
index 07bef9103c0d8..c85a1ea5b0549 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
@@ -46,11 +46,9 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx,
if (const auto *SymA = Target.getSymA()) {
// SCRATCH_RSRC_DWORD[01] is a special global variable that represents
// the scratch buffer.
- if (SymA->getSymbol().getName() == "SCRATCH_RSRC_DWORD0")
+ if (SymA->getSymbol().getName() == "SCRATCH_RSRC_DWORD0" ||
+ SymA->getSymbol().getName() == "SCRATCH_RSRC_DWORD1")
return ELF::R_AMDGPU_ABS32_LO;
-
- if (SymA->getSymbol().getName() == "SCRATCH_RSRC_DWORD1")
- return ELF::R_AMDGPU_ABS32_HI;
}
switch (Target.getAccessVariant()) {
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 6a41e3f650bc2..c17fe126546ce 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -17,7 +17,9 @@
#include "Utils/AMDGPUBaseInfo.h"
#include "Utils/AMDKernelCodeTUtils.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/AMDGPUMetadataVerifier.h"
#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/BinaryFormat/MsgPackTypes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Metadata.h"
@@ -27,6 +29,7 @@
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetParser.h"
namespace llvm {
#include "AMDGPUPTNote.h"
@@ -34,95 +37,116 @@ namespace llvm {
using namespace llvm;
using namespace llvm::AMDGPU;
+using namespace llvm::AMDGPU::HSAMD;
//===----------------------------------------------------------------------===//
// AMDGPUTargetStreamer
//===----------------------------------------------------------------------===//
-static const struct {
- const char *Name;
- unsigned Mach;
-} MachTable[] = {
- // Radeon HD 2000/3000 Series (R600).
- { "r600", ELF::EF_AMDGPU_MACH_R600_R600 },
- { "r630", ELF::EF_AMDGPU_MACH_R600_R630 },
- { "rs880", ELF::EF_AMDGPU_MACH_R600_RS880 },
- { "rv670", ELF::EF_AMDGPU_MACH_R600_RV670 },
- // Radeon HD 4000 Series (R700).
- { "rv710", ELF::EF_AMDGPU_MACH_R600_RV710 },
- { "rv730", ELF::EF_AMDGPU_MACH_R600_RV730 },
- { "rv770", ELF::EF_AMDGPU_MACH_R600_RV770 },
- // Radeon HD 5000 Series (Evergreen).
- { "cedar", ELF::EF_AMDGPU_MACH_R600_CEDAR },
- { "cypress", ELF::EF_AMDGPU_MACH_R600_CYPRESS },
- { "juniper", ELF::EF_AMDGPU_MACH_R600_JUNIPER },
- { "redwood", ELF::EF_AMDGPU_MACH_R600_REDWOOD },
- { "sumo", ELF::EF_AMDGPU_MACH_R600_SUMO },
- // Radeon HD 6000 Series (Northern Islands).
- { "barts", ELF::EF_AMDGPU_MACH_R600_BARTS },
- { "caicos", ELF::EF_AMDGPU_MACH_R600_CAICOS },
- { "cayman", ELF::EF_AMDGPU_MACH_R600_CAYMAN },
- { "turks", ELF::EF_AMDGPU_MACH_R600_TURKS },
- // AMDGCN GFX6.
- { "gfx600", ELF::EF_AMDGPU_MACH_AMDGCN_GFX600 },
- { "tahiti", ELF::EF_AMDGPU_MACH_AMDGCN_GFX600 },
- { "gfx601", ELF::EF_AMDGPU_MACH_AMDGCN_GFX601 },
- { "hainan", ELF::EF_AMDGPU_MACH_AMDGCN_GFX601 },
- { "oland", ELF::EF_AMDGPU_MACH_AMDGCN_GFX601 },
- { "pitcairn", ELF::EF_AMDGPU_MACH_AMDGCN_GFX601 },
- { "verde", ELF::EF_AMDGPU_MACH_AMDGCN_GFX601 },
- // AMDGCN GFX7.
- { "gfx700", ELF::EF_AMDGPU_MACH_AMDGCN_GFX700 },
- { "kaveri", ELF::EF_AMDGPU_MACH_AMDGCN_GFX700 },
- { "gfx701", ELF::EF_AMDGPU_MACH_AMDGCN_GFX701 },
- { "hawaii", ELF::EF_AMDGPU_MACH_AMDGCN_GFX701 },
- { "gfx702", ELF::EF_AMDGPU_MACH_AMDGCN_GFX702 },
- { "gfx703", ELF::EF_AMDGPU_MACH_AMDGCN_GFX703 },
- { "kabini", ELF::EF_AMDGPU_MACH_AMDGCN_GFX703 },
- { "mullins", ELF::EF_AMDGPU_MACH_AMDGCN_GFX703 },
- { "gfx704", ELF::EF_AMDGPU_MACH_AMDGCN_GFX704 },
- { "bonaire", ELF::EF_AMDGPU_MACH_AMDGCN_GFX704 },
- // AMDGCN GFX8.
- { "gfx801", ELF::EF_AMDGPU_MACH_AMDGCN_GFX801 },
- { "carrizo", ELF::EF_AMDGPU_MACH_AMDGCN_GFX801 },
- { "gfx802", ELF::EF_AMDGPU_MACH_AMDGCN_GFX802 },
- { "iceland", ELF::EF_AMDGPU_MACH_AMDGCN_GFX802 },
- { "tonga", ELF::EF_AMDGPU_MACH_AMDGCN_GFX802 },
- { "gfx803", ELF::EF_AMDGPU_MACH_AMDGCN_GFX803 },
- { "fiji", ELF::EF_AMDGPU_MACH_AMDGCN_GFX803 },
- { "polaris10", ELF::EF_AMDGPU_MACH_AMDGCN_GFX803 },
- { "polaris11", ELF::EF_AMDGPU_MACH_AMDGCN_GFX803 },
- { "gfx810", ELF::EF_AMDGPU_MACH_AMDGCN_GFX810 },
- { "stoney", ELF::EF_AMDGPU_MACH_AMDGCN_GFX810 },
- // AMDGCN GFX9.
- { "gfx900", ELF::EF_AMDGPU_MACH_AMDGCN_GFX900 },
- { "gfx902", ELF::EF_AMDGPU_MACH_AMDGCN_GFX902 },
- { "gfx904", ELF::EF_AMDGPU_MACH_AMDGCN_GFX904 },
- { "gfx906", ELF::EF_AMDGPU_MACH_AMDGCN_GFX906 },
- // Not specified processor.
- { nullptr, ELF::EF_AMDGPU_MACH_NONE }
-};
+bool AMDGPUTargetStreamer::EmitHSAMetadataV2(StringRef HSAMetadataString) {
+ HSAMD::Metadata HSAMetadata;
+ if (HSAMD::fromString(HSAMetadataString, HSAMetadata))
+ return false;
-unsigned AMDGPUTargetStreamer::getMACH(StringRef GPU) const {
- auto Entry = MachTable;
- for (; Entry->Name && GPU != Entry->Name; ++Entry)
- ;
- return Entry->Mach;
+ return EmitHSAMetadata(HSAMetadata);
}
-const char *AMDGPUTargetStreamer::getMachName(unsigned Mach) {
- auto Entry = MachTable;
- for (; Entry->Name && Mach != Entry->Mach; ++Entry)
- ;
- return Entry->Name;
+bool AMDGPUTargetStreamer::EmitHSAMetadataV3(StringRef HSAMetadataString) {
+ std::shared_ptr<msgpack::Node> HSAMetadataRoot;
+ yaml::Input YIn(HSAMetadataString);
+ YIn >> HSAMetadataRoot;
+ if (YIn.error())
+ return false;
+ return EmitHSAMetadata(HSAMetadataRoot, false);
}
-bool AMDGPUTargetStreamer::EmitHSAMetadata(StringRef HSAMetadataString) {
- HSAMD::Metadata HSAMetadata;
- if (HSAMD::fromString(HSAMetadataString, HSAMetadata))
- return false;
+StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) {
+ AMDGPU::GPUKind AK = GK_NONE; // ElfMach values with no case below map to GK_NONE
- return EmitHSAMetadata(HSAMetadata);
+ switch (ElfMach) {
+ case ELF::EF_AMDGPU_MACH_R600_R600: AK = GK_R600; break;
+ case ELF::EF_AMDGPU_MACH_R600_R630: AK = GK_R630; break;
+ case ELF::EF_AMDGPU_MACH_R600_RS880: AK = GK_RS880; break;
+ case ELF::EF_AMDGPU_MACH_R600_RV670: AK = GK_RV670; break;
+ case ELF::EF_AMDGPU_MACH_R600_RV710: AK = GK_RV710; break;
+ case ELF::EF_AMDGPU_MACH_R600_RV730: AK = GK_RV730; break;
+ case ELF::EF_AMDGPU_MACH_R600_RV770: AK = GK_RV770; break;
+ case ELF::EF_AMDGPU_MACH_R600_CEDAR: AK = GK_CEDAR; break;
+ case ELF::EF_AMDGPU_MACH_R600_CYPRESS: AK = GK_CYPRESS; break;
+ case ELF::EF_AMDGPU_MACH_R600_JUNIPER: AK = GK_JUNIPER; break;
+ case ELF::EF_AMDGPU_MACH_R600_REDWOOD: AK = GK_REDWOOD; break;
+ case ELF::EF_AMDGPU_MACH_R600_SUMO: AK = GK_SUMO; break;
+ case ELF::EF_AMDGPU_MACH_R600_BARTS: AK = GK_BARTS; break;
+ case ELF::EF_AMDGPU_MACH_R600_CAICOS: AK = GK_CAICOS; break;
+ case ELF::EF_AMDGPU_MACH_R600_CAYMAN: AK = GK_CAYMAN; break;
+ case ELF::EF_AMDGPU_MACH_R600_TURKS: AK = GK_TURKS; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX600: AK = GK_GFX600; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX601: AK = GK_GFX601; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX700: AK = GK_GFX700; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX701: AK = GK_GFX701; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX702: AK = GK_GFX702; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX703: AK = GK_GFX703; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX704: AK = GK_GFX704; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX801: AK = GK_GFX801; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX802: AK = GK_GFX802; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX803: AK = GK_GFX803; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX810: AK = GK_GFX810; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX900: AK = GK_GFX900; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX902: AK = GK_GFX902; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX904: AK = GK_GFX904; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX906: AK = GK_GFX906; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX909: AK = GK_GFX909; break;
+ case ELF::EF_AMDGPU_MACH_NONE: AK = GK_NONE; break;
+ }
+
+ StringRef GPUName = getArchNameAMDGCN(AK);
+ if (GPUName != "")
+ return GPUName;
+ return getArchNameR600(AK);
+}
+
+unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) {
+ AMDGPU::GPUKind AK = parseArchAMDGCN(GPU);
+ if (AK == AMDGPU::GPUKind::GK_NONE)
+ AK = parseArchR600(GPU);
+
+ switch (AK) {
+ case GK_R600: return ELF::EF_AMDGPU_MACH_R600_R600;
+ case GK_R630: return ELF::EF_AMDGPU_MACH_R600_R630;
+ case GK_RS880: return ELF::EF_AMDGPU_MACH_R600_RS880;
+ case GK_RV670: return ELF::EF_AMDGPU_MACH_R600_RV670;
+ case GK_RV710: return ELF::EF_AMDGPU_MACH_R600_RV710;
+ case GK_RV730: return ELF::EF_AMDGPU_MACH_R600_RV730;
+ case GK_RV770: return ELF::EF_AMDGPU_MACH_R600_RV770;
+ case GK_CEDAR: return ELF::EF_AMDGPU_MACH_R600_CEDAR;
+ case GK_CYPRESS: return ELF::EF_AMDGPU_MACH_R600_CYPRESS;
+ case GK_JUNIPER: return ELF::EF_AMDGPU_MACH_R600_JUNIPER;
+ case GK_REDWOOD: return ELF::EF_AMDGPU_MACH_R600_REDWOOD;
+ case GK_SUMO: return ELF::EF_AMDGPU_MACH_R600_SUMO;
+ case GK_BARTS: return ELF::EF_AMDGPU_MACH_R600_BARTS;
+ case GK_CAICOS: return ELF::EF_AMDGPU_MACH_R600_CAICOS;
+ case GK_CAYMAN: return ELF::EF_AMDGPU_MACH_R600_CAYMAN;
+ case GK_TURKS: return ELF::EF_AMDGPU_MACH_R600_TURKS;
+ case GK_GFX600: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX600;
+ case GK_GFX601: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX601;
+ case GK_GFX700: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX700;
+ case GK_GFX701: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX701;
+ case GK_GFX702: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX702;
+ case GK_GFX703: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX703;
+ case GK_GFX704: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX704;
+ case GK_GFX801: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX801;
+ case GK_GFX802: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX802;
+ case GK_GFX803: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX803;
+ case GK_GFX810: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX810;
+ case GK_GFX900: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX900;
+ case GK_GFX902: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX902;
+ case GK_GFX904: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX904;
+ case GK_GFX906: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX906;
+ case GK_GFX909: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX909;
+ case GK_NONE: return ELF::EF_AMDGPU_MACH_NONE;
+ }
+
+ llvm_unreachable("unknown GPU");
}
//===----------------------------------------------------------------------===//
@@ -183,9 +207,26 @@ bool AMDGPUTargetAsmStreamer::EmitHSAMetadata(
if (HSAMD::toString(HSAMetadata, HSAMetadataString))
return false;
- OS << '\t' << HSAMD::AssemblerDirectiveBegin << '\n';
+ OS << '\t' << AssemblerDirectiveBegin << '\n';
OS << HSAMetadataString << '\n';
- OS << '\t' << HSAMD::AssemblerDirectiveEnd << '\n';
+ OS << '\t' << AssemblerDirectiveEnd << '\n';
+ return true;
+}
+
+bool AMDGPUTargetAsmStreamer::EmitHSAMetadata(
+ std::shared_ptr<msgpack::Node> &HSAMetadataRoot, bool Strict) {
+ V3::MetadataVerifier Verifier(Strict);
+ if (!Verifier.verify(*HSAMetadataRoot))
+ return false;
+
+ std::string HSAMetadataString;
+ raw_string_ostream StrOS(HSAMetadataString);
+ yaml::Output YOut(StrOS);
+ YOut << HSAMetadataRoot;
+
+ OS << '\t' << V3::AssemblerDirectiveBegin << '\n';
+ OS << StrOS.str() << '\n';
+ OS << '\t' << V3::AssemblerDirectiveEnd << '\n';
return true;
}
@@ -203,70 +244,59 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
const MCSubtargetInfo &STI, StringRef KernelName,
const amdhsa::kernel_descriptor_t &KD, uint64_t NextVGPR, uint64_t NextSGPR,
bool ReserveVCC, bool ReserveFlatScr, bool ReserveXNACK) {
- amdhsa::kernel_descriptor_t DefaultKD = getDefaultAmdhsaKernelDescriptor();
-
- IsaInfo::IsaVersion IVersion = IsaInfo::getIsaVersion(STI.getFeatureBits());
+ IsaVersion IVersion = getIsaVersion(STI.getCPU());
OS << "\t.amdhsa_kernel " << KernelName << '\n';
-#define PRINT_IF_NOT_DEFAULT(STREAM, DIRECTIVE, KERNEL_DESC, \
- DEFAULT_KERNEL_DESC, MEMBER_NAME, FIELD_NAME) \
- if (AMDHSA_BITS_GET(KERNEL_DESC.MEMBER_NAME, FIELD_NAME) != \
- AMDHSA_BITS_GET(DEFAULT_KERNEL_DESC.MEMBER_NAME, FIELD_NAME)) \
- STREAM << "\t\t" << DIRECTIVE << " " \
- << AMDHSA_BITS_GET(KERNEL_DESC.MEMBER_NAME, FIELD_NAME) << '\n';
+#define PRINT_FIELD(STREAM, DIRECTIVE, KERNEL_DESC, MEMBER_NAME, FIELD_NAME) \
+ STREAM << "\t\t" << DIRECTIVE << " " \
+ << AMDHSA_BITS_GET(KERNEL_DESC.MEMBER_NAME, FIELD_NAME) << '\n';
- if (KD.group_segment_fixed_size != DefaultKD.group_segment_fixed_size)
- OS << "\t\t.amdhsa_group_segment_fixed_size " << KD.group_segment_fixed_size
- << '\n';
- if (KD.private_segment_fixed_size != DefaultKD.private_segment_fixed_size)
- OS << "\t\t.amdhsa_private_segment_fixed_size "
- << KD.private_segment_fixed_size << '\n';
+ OS << "\t\t.amdhsa_group_segment_fixed_size " << KD.group_segment_fixed_size
+ << '\n';
+ OS << "\t\t.amdhsa_private_segment_fixed_size "
+ << KD.private_segment_fixed_size << '\n';
- PRINT_IF_NOT_DEFAULT(
- OS, ".amdhsa_user_sgpr_private_segment_buffer", KD, DefaultKD,
- kernel_code_properties,
- amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER);
- PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_user_sgpr_dispatch_ptr", KD, DefaultKD,
- kernel_code_properties,
- amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR);
- PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_user_sgpr_queue_ptr", KD, DefaultKD,
- kernel_code_properties,
- amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR);
- PRINT_IF_NOT_DEFAULT(
- OS, ".amdhsa_user_sgpr_kernarg_segment_ptr", KD, DefaultKD,
- kernel_code_properties,
- amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR);
- PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_user_sgpr_dispatch_id", KD, DefaultKD,
- kernel_code_properties,
- amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID);
- PRINT_IF_NOT_DEFAULT(
- OS, ".amdhsa_user_sgpr_flat_scratch_init", KD, DefaultKD,
- kernel_code_properties,
- amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT);
- PRINT_IF_NOT_DEFAULT(
- OS, ".amdhsa_user_sgpr_private_segment_size", KD, DefaultKD,
- kernel_code_properties,
- amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE);
- PRINT_IF_NOT_DEFAULT(
- OS, ".amdhsa_system_sgpr_private_segment_wavefront_offset", KD, DefaultKD,
+ PRINT_FIELD(OS, ".amdhsa_user_sgpr_private_segment_buffer", KD,
+ kernel_code_properties,
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER);
+ PRINT_FIELD(OS, ".amdhsa_user_sgpr_dispatch_ptr", KD,
+ kernel_code_properties,
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR);
+ PRINT_FIELD(OS, ".amdhsa_user_sgpr_queue_ptr", KD,
+ kernel_code_properties,
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR);
+ PRINT_FIELD(OS, ".amdhsa_user_sgpr_kernarg_segment_ptr", KD,
+ kernel_code_properties,
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR);
+ PRINT_FIELD(OS, ".amdhsa_user_sgpr_dispatch_id", KD,
+ kernel_code_properties,
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID);
+ PRINT_FIELD(OS, ".amdhsa_user_sgpr_flat_scratch_init", KD,
+ kernel_code_properties,
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT);
+ PRINT_FIELD(OS, ".amdhsa_user_sgpr_private_segment_size", KD,
+ kernel_code_properties,
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE);
+ PRINT_FIELD(
+ OS, ".amdhsa_system_sgpr_private_segment_wavefront_offset", KD,
compute_pgm_rsrc2,
amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_PRIVATE_SEGMENT_WAVEFRONT_OFFSET);
- PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_system_sgpr_workgroup_id_x", KD, DefaultKD,
- compute_pgm_rsrc2,
- amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X);
- PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_system_sgpr_workgroup_id_y", KD, DefaultKD,
- compute_pgm_rsrc2,
- amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Y);
- PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_system_sgpr_workgroup_id_z", KD, DefaultKD,
- compute_pgm_rsrc2,
- amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Z);
- PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_system_sgpr_workgroup_info", KD, DefaultKD,
- compute_pgm_rsrc2,
- amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_INFO);
- PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_system_vgpr_workitem_id", KD, DefaultKD,
- compute_pgm_rsrc2,
- amdhsa::COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID);
+ PRINT_FIELD(OS, ".amdhsa_system_sgpr_workgroup_id_x", KD,
+ compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X);
+ PRINT_FIELD(OS, ".amdhsa_system_sgpr_workgroup_id_y", KD,
+ compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Y);
+ PRINT_FIELD(OS, ".amdhsa_system_sgpr_workgroup_id_z", KD,
+ compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Z);
+ PRINT_FIELD(OS, ".amdhsa_system_sgpr_workgroup_info", KD,
+ compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_INFO);
+ PRINT_FIELD(OS, ".amdhsa_system_vgpr_workitem_id", KD,
+ compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID);
// These directives are required.
OS << "\t\t.amdhsa_next_free_vgpr " << NextVGPR << '\n';
@@ -279,54 +309,52 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
if (IVersion.Major >= 8 && ReserveXNACK != hasXNACK(STI))
OS << "\t\t.amdhsa_reserve_xnack_mask " << ReserveXNACK << '\n';
- PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_float_round_mode_32", KD, DefaultKD,
- compute_pgm_rsrc1,
- amdhsa::COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_32);
- PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_float_round_mode_16_64", KD, DefaultKD,
- compute_pgm_rsrc1,
- amdhsa::COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_16_64);
- PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_float_denorm_mode_32", KD, DefaultKD,
- compute_pgm_rsrc1,
- amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_32);
- PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_float_denorm_mode_16_64", KD, DefaultKD,
- compute_pgm_rsrc1,
- amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64);
- PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_dx10_clamp", KD, DefaultKD,
- compute_pgm_rsrc1,
- amdhsa::COMPUTE_PGM_RSRC1_ENABLE_DX10_CLAMP);
- PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_ieee_mode", KD, DefaultKD,
- compute_pgm_rsrc1,
- amdhsa::COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE);
+ PRINT_FIELD(OS, ".amdhsa_float_round_mode_32", KD,
+ compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_32);
+ PRINT_FIELD(OS, ".amdhsa_float_round_mode_16_64", KD,
+ compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_16_64);
+ PRINT_FIELD(OS, ".amdhsa_float_denorm_mode_32", KD,
+ compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_32);
+ PRINT_FIELD(OS, ".amdhsa_float_denorm_mode_16_64", KD,
+ compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64);
+ PRINT_FIELD(OS, ".amdhsa_dx10_clamp", KD,
+ compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_ENABLE_DX10_CLAMP);
+ PRINT_FIELD(OS, ".amdhsa_ieee_mode", KD,
+ compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE);
if (IVersion.Major >= 9)
- PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_fp16_overflow", KD, DefaultKD,
- compute_pgm_rsrc1,
- amdhsa::COMPUTE_PGM_RSRC1_FP16_OVFL);
- PRINT_IF_NOT_DEFAULT(
- OS, ".amdhsa_exception_fp_ieee_invalid_op", KD, DefaultKD,
+ PRINT_FIELD(OS, ".amdhsa_fp16_overflow", KD,
+ compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_FP16_OVFL);
+ PRINT_FIELD(
+ OS, ".amdhsa_exception_fp_ieee_invalid_op", KD,
compute_pgm_rsrc2,
amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION);
- PRINT_IF_NOT_DEFAULT(
- OS, ".amdhsa_exception_fp_denorm_src", KD, DefaultKD, compute_pgm_rsrc2,
- amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE);
- PRINT_IF_NOT_DEFAULT(
- OS, ".amdhsa_exception_fp_ieee_div_zero", KD, DefaultKD,
+ PRINT_FIELD(OS, ".amdhsa_exception_fp_denorm_src", KD,
+ compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE);
+ PRINT_FIELD(
+ OS, ".amdhsa_exception_fp_ieee_div_zero", KD,
compute_pgm_rsrc2,
amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO);
- PRINT_IF_NOT_DEFAULT(
- OS, ".amdhsa_exception_fp_ieee_overflow", KD, DefaultKD,
- compute_pgm_rsrc2,
- amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW);
- PRINT_IF_NOT_DEFAULT(
- OS, ".amdhsa_exception_fp_ieee_underflow", KD, DefaultKD,
- compute_pgm_rsrc2,
- amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW);
- PRINT_IF_NOT_DEFAULT(
- OS, ".amdhsa_exception_fp_ieee_inexact", KD, DefaultKD, compute_pgm_rsrc2,
- amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT);
- PRINT_IF_NOT_DEFAULT(
- OS, ".amdhsa_exception_int_div_zero", KD, DefaultKD, compute_pgm_rsrc2,
- amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO);
-#undef PRINT_IF_NOT_DEFAULT
+ PRINT_FIELD(OS, ".amdhsa_exception_fp_ieee_overflow", KD,
+ compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW);
+ PRINT_FIELD(OS, ".amdhsa_exception_fp_ieee_underflow", KD,
+ compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW);
+ PRINT_FIELD(OS, ".amdhsa_exception_fp_ieee_inexact", KD,
+ compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT);
+ PRINT_FIELD(OS, ".amdhsa_exception_int_div_zero", KD,
+ compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO);
+#undef PRINT_FIELD
OS << "\t.end_amdhsa_kernel\n";
}
@@ -342,12 +370,16 @@ AMDGPUTargetELFStreamer::AMDGPUTargetELFStreamer(
unsigned EFlags = MCA.getELFHeaderEFlags();
EFlags &= ~ELF::EF_AMDGPU_MACH;
- EFlags |= getMACH(STI.getCPU());
+ EFlags |= getElfMach(STI.getCPU());
EFlags &= ~ELF::EF_AMDGPU_XNACK;
if (AMDGPU::hasXNACK(STI))
EFlags |= ELF::EF_AMDGPU_XNACK;
+ EFlags &= ~ELF::EF_AMDGPU_SRAM_ECC;
+ if (AMDGPU::hasSRAMECC(STI))
+ EFlags |= ELF::EF_AMDGPU_SRAM_ECC;
+
MCA.setELFHeaderEFlags(EFlags);
}
@@ -355,13 +387,13 @@ MCELFStreamer &AMDGPUTargetELFStreamer::getStreamer() {
return static_cast<MCELFStreamer &>(Streamer);
}
-void AMDGPUTargetELFStreamer::EmitAMDGPUNote(
- const MCExpr *DescSZ, unsigned NoteType,
+void AMDGPUTargetELFStreamer::EmitNote(
+ StringRef Name, const MCExpr *DescSZ, unsigned NoteType,
function_ref<void(MCELFStreamer &)> EmitDesc) {
auto &S = getStreamer();
auto &Context = S.getContext();
- auto NameSZ = sizeof(ElfNote::NoteName);
+ auto NameSZ = Name.size() + 1;
S.PushSection();
S.SwitchSection(Context.getELFSection(
@@ -369,7 +401,7 @@ void AMDGPUTargetELFStreamer::EmitAMDGPUNote(
S.EmitIntValue(NameSZ, 4); // namesz
S.EmitValue(DescSZ, 4); // descz
S.EmitIntValue(NoteType, 4); // type
- S.EmitBytes(StringRef(ElfNote::NoteName, NameSZ)); // name
+ S.EmitBytes(Name); // name
S.EmitValueToAlignment(4, 0, 1, 0); // padding 0
EmitDesc(S); // desc
S.EmitValueToAlignment(4, 0, 1, 0); // padding 0
@@ -381,14 +413,11 @@ void AMDGPUTargetELFStreamer::EmitDirectiveAMDGCNTarget(StringRef Target) {}
void AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectVersion(
uint32_t Major, uint32_t Minor) {
- EmitAMDGPUNote(
- MCConstantExpr::create(8, getContext()),
- ElfNote::NT_AMDGPU_HSA_CODE_OBJECT_VERSION,
- [&](MCELFStreamer &OS){
- OS.EmitIntValue(Major, 4);
- OS.EmitIntValue(Minor, 4);
- }
- );
+ EmitNote(ElfNote::NoteNameV2, MCConstantExpr::create(8, getContext()),
+ ElfNote::NT_AMDGPU_HSA_CODE_OBJECT_VERSION, [&](MCELFStreamer &OS) {
+ OS.EmitIntValue(Major, 4);
+ OS.EmitIntValue(Minor, 4);
+ });
}
void
@@ -404,21 +433,18 @@ AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectISA(uint32_t Major,
sizeof(Major) + sizeof(Minor) + sizeof(Stepping) +
VendorNameSize + ArchNameSize;
- EmitAMDGPUNote(
- MCConstantExpr::create(DescSZ, getContext()),
- ElfNote::NT_AMDGPU_HSA_ISA,
- [&](MCELFStreamer &OS) {
- OS.EmitIntValue(VendorNameSize, 2);
- OS.EmitIntValue(ArchNameSize, 2);
- OS.EmitIntValue(Major, 4);
- OS.EmitIntValue(Minor, 4);
- OS.EmitIntValue(Stepping, 4);
- OS.EmitBytes(VendorName);
- OS.EmitIntValue(0, 1); // NULL terminate VendorName
- OS.EmitBytes(ArchName);
- OS.EmitIntValue(0, 1); // NULL terminte ArchName
- }
- );
+ EmitNote(ElfNote::NoteNameV2, MCConstantExpr::create(DescSZ, getContext()),
+ ElfNote::NT_AMDGPU_HSA_ISA, [&](MCELFStreamer &OS) {
+ OS.EmitIntValue(VendorNameSize, 2);
+ OS.EmitIntValue(ArchNameSize, 2);
+ OS.EmitIntValue(Major, 4);
+ OS.EmitIntValue(Minor, 4);
+ OS.EmitIntValue(Stepping, 4);
+ OS.EmitBytes(VendorName);
+ OS.EmitIntValue(0, 1); // NULL terminate VendorName
+ OS.EmitBytes(ArchName);
+ OS.EmitIntValue(0, 1); // NULL terminate ArchName
+ });
}
void
@@ -447,15 +473,41 @@ bool AMDGPUTargetELFStreamer::EmitISAVersion(StringRef IsaVersionString) {
MCSymbolRefExpr::create(DescEnd, Context),
MCSymbolRefExpr::create(DescBegin, Context), Context);
- EmitAMDGPUNote(
- DescSZ,
- ELF::NT_AMD_AMDGPU_ISA,
- [&](MCELFStreamer &OS) {
- OS.EmitLabel(DescBegin);
- OS.EmitBytes(IsaVersionString);
- OS.EmitLabel(DescEnd);
- }
- );
+ EmitNote(ElfNote::NoteNameV2, DescSZ, ELF::NT_AMD_AMDGPU_ISA,
+ [&](MCELFStreamer &OS) {
+ OS.EmitLabel(DescBegin);
+ OS.EmitBytes(IsaVersionString);
+ OS.EmitLabel(DescEnd);
+ });
+ return true;
+}
+
+bool AMDGPUTargetELFStreamer::EmitHSAMetadata(
+ std::shared_ptr<msgpack::Node> &HSAMetadataRoot, bool Strict) {
+ V3::MetadataVerifier Verifier(Strict);
+ if (!Verifier.verify(*HSAMetadataRoot))
+ return false;
+
+ std::string HSAMetadataString;
+ raw_string_ostream StrOS(HSAMetadataString);
+ msgpack::Writer MPWriter(StrOS);
+ HSAMetadataRoot->write(MPWriter);
+
+ // Create two labels to mark the beginning and end of the desc field
+ // and a MCExpr to calculate the size of the desc field.
+ auto &Context = getContext();
+ auto *DescBegin = Context.createTempSymbol();
+ auto *DescEnd = Context.createTempSymbol();
+ auto *DescSZ = MCBinaryExpr::createSub(
+ MCSymbolRefExpr::create(DescEnd, Context),
+ MCSymbolRefExpr::create(DescBegin, Context), Context);
+
+ EmitNote(ElfNote::NoteNameV3, DescSZ, ELF::NT_AMDGPU_METADATA,
+ [&](MCELFStreamer &OS) {
+ OS.EmitLabel(DescBegin);
+ OS.EmitBytes(StrOS.str());
+ OS.EmitLabel(DescEnd);
+ });
return true;
}
@@ -474,28 +526,24 @@ bool AMDGPUTargetELFStreamer::EmitHSAMetadata(
MCSymbolRefExpr::create(DescEnd, Context),
MCSymbolRefExpr::create(DescBegin, Context), Context);
- EmitAMDGPUNote(
- DescSZ,
- ELF::NT_AMD_AMDGPU_HSA_METADATA,
- [&](MCELFStreamer &OS) {
- OS.EmitLabel(DescBegin);
- OS.EmitBytes(HSAMetadataString);
- OS.EmitLabel(DescEnd);
- }
- );
+ EmitNote(ElfNote::NoteNameV2, DescSZ, ELF::NT_AMD_AMDGPU_HSA_METADATA,
+ [&](MCELFStreamer &OS) {
+ OS.EmitLabel(DescBegin);
+ OS.EmitBytes(HSAMetadataString);
+ OS.EmitLabel(DescEnd);
+ });
return true;
}
bool AMDGPUTargetELFStreamer::EmitPALMetadata(
const PALMD::Metadata &PALMetadata) {
- EmitAMDGPUNote(
- MCConstantExpr::create(PALMetadata.size() * sizeof(uint32_t), getContext()),
- ELF::NT_AMD_AMDGPU_PAL_METADATA,
- [&](MCELFStreamer &OS){
- for (auto I : PALMetadata)
- OS.EmitIntValue(I, sizeof(uint32_t));
- }
- );
+ EmitNote(ElfNote::NoteNameV2,
+ MCConstantExpr::create(PALMetadata.size() * sizeof(uint32_t),
+ getContext()),
+ ELF::NT_AMD_AMDGPU_PAL_METADATA, [&](MCELFStreamer &OS) {
+ for (auto I : PALMetadata)
+ OS.EmitIntValue(I, sizeof(uint32_t));
+ });
return true;
}
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index 472da1b735936..9a807c804f9ff 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -11,6 +11,7 @@
#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUTARGETSTREAMER_H
#include "AMDKernelCodeT.h"
+#include "llvm/BinaryFormat/MsgPackTypes.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/AMDGPUMetadata.h"
@@ -31,13 +32,7 @@ class AMDGPUTargetStreamer : public MCTargetStreamer {
protected:
MCContext &getContext() const { return Streamer.getContext(); }
- /// \returns Equivalent EF_AMDGPU_MACH_* value for given \p GPU name.
- unsigned getMACH(StringRef GPU) const;
-
public:
- /// \returns Equivalent GPU name for an EF_AMDGPU_MACH_* value.
- static const char *getMachName(unsigned Mach);
-
AMDGPUTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
virtual void EmitDirectiveAMDGCNTarget(StringRef Target) = 0;
@@ -58,7 +53,20 @@ public:
virtual bool EmitISAVersion(StringRef IsaVersionString) = 0;
/// \returns True on success, false on failure.
- virtual bool EmitHSAMetadata(StringRef HSAMetadataString);
+ virtual bool EmitHSAMetadataV2(StringRef HSAMetadataString);
+
+ /// \returns True on success, false on failure.
+ virtual bool EmitHSAMetadataV3(StringRef HSAMetadataString);
+
+ /// Emit HSA Metadata
+ ///
+ /// When \p Strict is true, known metadata elements must already be
+ /// well-typed. When \p Strict is false, known types are inferred and
+ /// the \p HSAMetadata structure is updated with the correct types.
+ ///
+ /// \returns True on success, false on failure.
+ virtual bool EmitHSAMetadata(std::shared_ptr<msgpack::Node> &HSAMetadata,
+ bool Strict) = 0;
/// \returns True on success, false on failure.
virtual bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) = 0;
@@ -71,6 +79,9 @@ public:
const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,
uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr,
bool ReserveXNACK) = 0;
+
+ static StringRef getArchNameFromElfMach(unsigned ElfMach);
+ static unsigned getElfMach(StringRef GPU);
};
class AMDGPUTargetAsmStreamer final : public AMDGPUTargetStreamer {
@@ -95,6 +106,10 @@ public:
bool EmitISAVersion(StringRef IsaVersionString) override;
/// \returns True on success, false on failure.
+ bool EmitHSAMetadata(std::shared_ptr<msgpack::Node> &HSAMetadata,
+ bool Strict) override;
+
+ /// \returns True on success, false on failure.
bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) override;
/// \returns True on success, false on failure.
@@ -110,8 +125,8 @@ public:
class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer {
MCStreamer &Streamer;
- void EmitAMDGPUNote(const MCExpr *DescSize, unsigned NoteType,
- function_ref<void(MCELFStreamer &)> EmitDesc);
+ void EmitNote(StringRef Name, const MCExpr *DescSize, unsigned NoteType,
+ function_ref<void(MCELFStreamer &)> EmitDesc);
public:
AMDGPUTargetELFStreamer(MCStreamer &S, const MCSubtargetInfo &STI);
@@ -135,6 +150,10 @@ public:
bool EmitISAVersion(StringRef IsaVersionString) override;
/// \returns True on success, false on failure.
+ bool EmitHSAMetadata(std::shared_ptr<msgpack::Node> &HSAMetadata,
+ bool Strict) override;
+
+ /// \returns True on success, false on failure.
bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) override;
/// \returns True on success, false on failure.
diff --git a/lib/Target/AMDGPU/MCTargetDesc/LLVMBuild.txt b/lib/Target/AMDGPU/MCTargetDesc/LLVMBuild.txt
index 773ee7c0a4ba7..bc910a470d72a 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/LLVMBuild.txt
+++ b/lib/Target/AMDGPU/MCTargetDesc/LLVMBuild.txt
@@ -19,5 +19,5 @@
type = Library
name = AMDGPUDesc
parent = AMDGPU
-required_libraries = Core MC AMDGPUAsmPrinter AMDGPUInfo AMDGPUUtils Support
+required_libraries = Core MC AMDGPUAsmPrinter AMDGPUInfo AMDGPUUtils Support BinaryFormat
add_to_library_groups = AMDGPU
diff --git a/lib/Target/AMDGPU/MIMGInstructions.td b/lib/Target/AMDGPU/MIMGInstructions.td
index 44c2d366e4613..1c68dbd78e758 100644
--- a/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/lib/Target/AMDGPU/MIMGInstructions.td
@@ -29,6 +29,7 @@ class MIMGBaseOpcode {
bit Atomic = 0;
bit AtomicX2 = 0; // (f)cmpswap
bit Sampler = 0;
+ bit Gather4 = 0;
bits<8> NumExtraArgs = 0;
bit Gradients = 0;
bit Coordinates = 1;
@@ -43,7 +44,7 @@ def MIMGBaseOpcode : GenericEnum {
def MIMGBaseOpcodesTable : GenericTable {
let FilterClass = "MIMGBaseOpcode";
let CppTypeName = "MIMGBaseOpcodeInfo";
- let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler",
+ let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler", "Gather4",
"NumExtraArgs", "Gradients", "Coordinates", "LodOrClampOrMip",
"HasD16"];
GenericEnum TypeOf_BaseOpcode = MIMGBaseOpcode;
@@ -141,7 +142,7 @@ class MIMG_NoSampler_Helper <bits<7> op, string asm,
let InOperandList = !con((ins addr_rc:$vaddr, SReg_256:$srsrc,
DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc,
- R128:$r128, TFE:$tfe, LWE:$lwe, DA:$da),
+ R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da),
!if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da"
#!if(BaseOpcode.HasD16, "$d16", "");
@@ -179,6 +180,8 @@ multiclass MIMG_NoSampler <bits<7> op, string asm, bit has_d16, bit mip = 0,
defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 0>;
let VDataDwords = 4 in
defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 0>;
+ let VDataDwords = 8 in
+ defm _V8 : MIMG_NoSampler_Src_Helper <op, asm, VReg_256, 0>;
}
}
@@ -199,7 +202,7 @@ class MIMG_Store_Helper <bits<7> op, string asm,
let InOperandList = !con((ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc,
DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc,
- R128:$r128, TFE:$tfe, LWE:$lwe, DA:$da),
+ R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da),
!if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da"
#!if(BaseOpcode.HasD16, "$d16", "");
@@ -252,7 +255,7 @@ class MIMG_Atomic_Helper <string asm, RegisterClass data_rc,
let InOperandList = (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc,
DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc,
- R128:$r128, TFE:$tfe, LWE:$lwe, DA:$da);
+ R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da);
let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da";
}
@@ -316,7 +319,7 @@ class MIMG_Sampler_Helper <bits<7> op, string asm, RegisterClass dst_rc,
let InOperandList = !con((ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp,
DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc,
- R128:$r128, TFE:$tfe, LWE:$lwe, DA:$da),
+ R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da),
!if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
let AsmString = asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da"
#!if(BaseOpcode.HasD16, "$d16", "");
@@ -411,6 +414,8 @@ multiclass MIMG_Sampler <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0,
defm _V3 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_96>;
let VDataDwords = 4 in
defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128>;
+ let VDataDwords = 8 in
+ defm _V8 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_256>;
}
}
@@ -421,6 +426,7 @@ multiclass MIMG_Gather <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0,
string asm = "image_gather4"#sample.LowerCaseMod> {
def "" : MIMG_Sampler_BaseOpcode<sample> {
let HasD16 = 1;
+ let Gather4 = 1;
}
let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm,
@@ -429,6 +435,8 @@ multiclass MIMG_Gather <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0,
defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64>; /* for packed D16 only */
let VDataDwords = 4 in
defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128, 1>;
+ let VDataDwords = 8 in
+ defm _V8 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_256>;
}
}
diff --git a/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp b/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
index 1683fe6c9a571..679cf18d2c20b 100644
--- a/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
+++ b/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
@@ -226,11 +226,11 @@ private:
// occur in the same basic block as its definition, because
// it is illegal for the scheduler to schedule them in
// different blocks.
- if (UseI->readsRegister(MOI->getReg()))
+ if (UseI->readsRegister(MOI->getReg(), &TRI))
LastUseCount = AluInstCount;
// Exit early if the current use kills the register
- if (UseI != Def && UseI->killsRegister(MOI->getReg()))
+ if (UseI != Def && UseI->killsRegister(MOI->getReg(), &TRI))
break;
}
if (LastUseCount)
diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp
index 113d6249fa60a..e2a0f05d2b34d 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -589,7 +589,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
}
case Intrinsic::r600_implicitarg_ptr: {
- MVT PtrVT = getPointerTy(DAG.getDataLayout(), AMDGPUASI.PARAM_I_ADDRESS);
+ MVT PtrVT = getPointerTy(DAG.getDataLayout(), AMDGPUAS::PARAM_I_ADDRESS);
uint32_t ByteOffset = getImplicitParameterOffset(MF, FIRST_IMPLICIT);
return DAG.getConstant(ByteOffset, DL, PtrVT);
}
@@ -741,12 +741,12 @@ SDValue R600TargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
SDValue Op,
SelectionDAG &DAG) const {
GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
- if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS)
+ if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
const DataLayout &DL = DAG.getDataLayout();
const GlobalValue *GV = GSD->getGlobal();
- MVT ConstPtrVT = getPointerTy(DL, AMDGPUASI.CONSTANT_ADDRESS);
+ MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
SDValue GA = DAG.getTargetGlobalAddress(GV, SDLoc(GSD), ConstPtrVT);
return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, SDLoc(GSD), ConstPtrVT, GA);
@@ -903,7 +903,7 @@ SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
unsigned DwordOffset) const {
unsigned ByteOffset = DwordOffset * 4;
PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
- AMDGPUASI.CONSTANT_BUFFER_0);
+ AMDGPUAS::PARAM_I_ADDRESS);
// We shouldn't be using an offset wider than 16-bits for implicit parameters.
assert(isInt<16>(ByteOffset));
@@ -1141,7 +1141,7 @@ SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store,
//TODO: Who creates the i8 stores?
assert(Store->isTruncatingStore()
|| Store->getValue().getValueType() == MVT::i8);
- assert(Store->getAddressSpace() == AMDGPUASI.PRIVATE_ADDRESS);
+ assert(Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS);
SDValue Mask;
if (Store->getMemoryVT() == MVT::i8) {
@@ -1175,7 +1175,7 @@ SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store,
// Load dword
// TODO: can we be smarter about machine pointer info?
MachinePointerInfo PtrInfo(UndefValue::get(
- Type::getInt32PtrTy(*DAG.getContext(), AMDGPUASI.PRIVATE_ADDRESS)));
+ Type::getInt32PtrTy(*DAG.getContext(), AMDGPUAS::PRIVATE_ADDRESS)));
SDValue Dst = DAG.getLoad(MVT::i32, DL, Chain, Ptr, PtrInfo);
Chain = Dst.getValue(1);
@@ -1241,9 +1241,9 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
// Neither LOCAL nor PRIVATE can do vectors at the moment
- if ((AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.PRIVATE_ADDRESS) &&
+ if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) &&
VT.isVector()) {
- if ((AS == AMDGPUASI.PRIVATE_ADDRESS) &&
+ if ((AS == AMDGPUAS::PRIVATE_ADDRESS) &&
StoreNode->isTruncatingStore()) {
// Add an extra level of chain to isolate this vector
SDValue NewChain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, Chain);
@@ -1267,7 +1267,7 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, PtrVT, Ptr,
DAG.getConstant(2, DL, PtrVT));
- if (AS == AMDGPUASI.GLOBAL_ADDRESS) {
+ if (AS == AMDGPUAS::GLOBAL_ADDRESS) {
// It is beneficial to create MSKOR here instead of combiner to avoid
// artificial dependencies introduced by RMW
if (StoreNode->isTruncatingStore()) {
@@ -1320,7 +1320,7 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
}
// GLOBAL_ADDRESS has been handled above, LOCAL_ADDRESS allows all sizes
- if (AS != AMDGPUASI.PRIVATE_ADDRESS)
+ if (AS != AMDGPUAS::PRIVATE_ADDRESS)
return SDValue();
if (MemVT.bitsLT(MVT::i32))
@@ -1403,7 +1403,7 @@ SDValue R600TargetLowering::lowerPrivateExtLoad(SDValue Op,
// Load dword
// TODO: can we be smarter about machine pointer info?
MachinePointerInfo PtrInfo(UndefValue::get(
- Type::getInt32PtrTy(*DAG.getContext(), AMDGPUASI.PRIVATE_ADDRESS)));
+ Type::getInt32PtrTy(*DAG.getContext(), AMDGPUAS::PRIVATE_ADDRESS)));
SDValue Read = DAG.getLoad(MVT::i32, DL, Chain, Ptr, PtrInfo);
// Get offset within the register.
@@ -1441,7 +1441,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
EVT MemVT = LoadNode->getMemoryVT();
ISD::LoadExtType ExtType = LoadNode->getExtensionType();
- if (AS == AMDGPUASI.PRIVATE_ADDRESS &&
+ if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
ExtType != ISD::NON_EXTLOAD && MemVT.bitsLT(MVT::i32)) {
return lowerPrivateExtLoad(Op, DAG);
}
@@ -1451,45 +1451,29 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDValue Chain = LoadNode->getChain();
SDValue Ptr = LoadNode->getBasePtr();
- if ((LoadNode->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS ||
- LoadNode->getAddressSpace() == AMDGPUASI.PRIVATE_ADDRESS) &&
+ if ((LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+ LoadNode->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
VT.isVector()) {
return scalarizeVectorLoad(LoadNode, DAG);
}
+ // This is still used for explicit load from addrspace(8)
int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
if (ConstantBlock > -1 &&
((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
(LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
SDValue Result;
- if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
- isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
+ if (isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
isa<ConstantSDNode>(Ptr)) {
- SDValue Slots[4];
- for (unsigned i = 0; i < 4; i++) {
- // We want Const position encoded with the following formula :
- // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
- // const_index is Ptr computed by llvm using an alignment of 16.
- // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
- // then div by 4 at the ISel step
- SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
- DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32));
- Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
- }
- EVT NewVT = MVT::v4i32;
- unsigned NumElements = 4;
- if (VT.isVector()) {
- NewVT = VT;
- NumElements = VT.getVectorNumElements();
- }
- Result = DAG.getBuildVector(NewVT, DL, makeArrayRef(Slots, NumElements));
+ return constBufferLoad(LoadNode, LoadNode->getAddressSpace(), DAG);
} else {
+ //TODO: Does this even work?
// non-constant ptr can't be folded, keeps it as a v4f32 load
Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
DAG.getConstant(4, DL, MVT::i32)),
DAG.getConstant(LoadNode->getAddressSpace() -
- AMDGPUASI.CONSTANT_BUFFER_0, DL, MVT::i32)
+ AMDGPUAS::CONSTANT_BUFFER_0, DL, MVT::i32)
);
}
@@ -1525,7 +1509,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
return DAG.getMergeValues(MergedValues, DL);
}
- if (LoadNode->getAddressSpace() != AMDGPUASI.PRIVATE_ADDRESS) {
+ if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
return SDValue();
}
@@ -1622,7 +1606,7 @@ SDValue R600TargetLowering::LowerFormalArguments(
}
PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
- AMDGPUASI.CONSTANT_BUFFER_0);
+ AMDGPUAS::PARAM_I_ADDRESS);
// i64 isn't a legal type, so the register type used ends up as i32, which
// isn't expected here. It attempts to create this sextload, but it ends up
@@ -1646,17 +1630,17 @@ SDValue R600TargetLowering::LowerFormalArguments(
unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
unsigned PartOffset = VA.getLocMemOffset();
+ unsigned Alignment = MinAlign(VT.getStoreSize(), PartOffset);
MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
SDValue Arg = DAG.getLoad(
ISD::UNINDEXED, Ext, VT, DL, Chain,
DAG.getConstant(PartOffset, DL, MVT::i32), DAG.getUNDEF(MVT::i32),
PtrInfo,
- MemVT, /* Alignment = */ 4, MachineMemOperand::MONonTemporal |
+ MemVT, Alignment, MachineMemOperand::MONonTemporal |
MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant);
- // 4 is the preferred alignment for the CONSTANT memory space.
InVals.push_back(Arg);
}
return Chain;
@@ -1672,7 +1656,7 @@ EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
bool R600TargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
const SelectionDAG &DAG) const {
// Local and Private addresses do not handle vectors. Limit to i32
- if ((AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.PRIVATE_ADDRESS)) {
+ if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS)) {
return (MemVT.getSizeInBits() <= 32);
}
return true;
@@ -1701,14 +1685,15 @@ bool R600TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
static SDValue CompactSwizzlableVector(
SelectionDAG &DAG, SDValue VectorEntry,
DenseMap<unsigned, unsigned> &RemapSwizzle) {
- assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
assert(RemapSwizzle.empty());
- SDValue NewBldVec[4] = {
- VectorEntry.getOperand(0),
- VectorEntry.getOperand(1),
- VectorEntry.getOperand(2),
- VectorEntry.getOperand(3)
- };
+
+ SDLoc DL(VectorEntry);
+ EVT EltTy = VectorEntry.getValueType().getVectorElementType();
+
+ SDValue NewBldVec[4];
+ for (unsigned i = 0; i < 4; i++)
+ NewBldVec[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltTy, VectorEntry,
+ DAG.getIntPtrConstant(i, DL));
for (unsigned i = 0; i < 4; i++) {
if (NewBldVec[i].isUndef())
@@ -1743,15 +1728,17 @@ static SDValue CompactSwizzlableVector(
static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
DenseMap<unsigned, unsigned> &RemapSwizzle) {
- assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
assert(RemapSwizzle.empty());
- SDValue NewBldVec[4] = {
- VectorEntry.getOperand(0),
- VectorEntry.getOperand(1),
- VectorEntry.getOperand(2),
- VectorEntry.getOperand(3)
- };
- bool isUnmovable[4] = { false, false, false, false };
+
+ SDLoc DL(VectorEntry);
+ EVT EltTy = VectorEntry.getValueType().getVectorElementType();
+
+ SDValue NewBldVec[4];
+ bool isUnmovable[4] = {false, false, false, false};
+ for (unsigned i = 0; i < 4; i++)
+ NewBldVec[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltTy, VectorEntry,
+ DAG.getIntPtrConstant(i, DL));
+
for (unsigned i = 0; i < 4; i++) {
RemapSwizzle[i] = i;
if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
@@ -1782,7 +1769,6 @@ static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4],
SelectionDAG &DAG,
const SDLoc &DL) const {
- assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
// Old -> New swizzle values
DenseMap<unsigned, unsigned> SwizzleRemap;
@@ -1804,6 +1790,52 @@ SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4],
return BuildVector;
}
+SDValue R600TargetLowering::constBufferLoad(LoadSDNode *LoadNode, int Block,
+ SelectionDAG &DAG) const {
+ SDLoc DL(LoadNode);
+ EVT VT = LoadNode->getValueType(0);
+ SDValue Chain = LoadNode->getChain();
+ SDValue Ptr = LoadNode->getBasePtr();
+ assert (isa<ConstantSDNode>(Ptr));
+
+ //TODO: Support smaller loads
+ if (LoadNode->getMemoryVT().getScalarType() != MVT::i32 || !ISD::isNON_EXTLoad(LoadNode))
+ return SDValue();
+
+ if (LoadNode->getAlignment() < 4)
+ return SDValue();
+
+ int ConstantBlock = ConstantAddressBlock(Block);
+
+ SDValue Slots[4];
+ for (unsigned i = 0; i < 4; i++) {
+ // We want Const position encoded with the following formula :
+ // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
+ // const_index is Ptr computed by llvm using an alignment of 16.
+ // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
+ // then div by 4 at the ISel step
+ SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
+ DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32));
+ Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
+ }
+ EVT NewVT = MVT::v4i32;
+ unsigned NumElements = 4;
+ if (VT.isVector()) {
+ NewVT = VT;
+ NumElements = VT.getVectorNumElements();
+ }
+ SDValue Result = DAG.getBuildVector(NewVT, DL, makeArrayRef(Slots, NumElements));
+ if (!VT.isVector()) {
+ Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
+ DAG.getConstant(0, DL, MVT::i32));
+ }
+ SDValue MergedValues[2] = {
+ Result,
+ Chain
+ };
+ return DAG.getMergeValues(MergedValues, DL);
+}
+
//===----------------------------------------------------------------------===//
// Custom DAG Optimizations
//===----------------------------------------------------------------------===//
@@ -2022,6 +2054,16 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG, DL);
return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, N->getVTList(), NewArgs);
}
+
+ case ISD::LOAD: {
+ LoadSDNode *LoadNode = cast<LoadSDNode>(N);
+ SDValue Ptr = LoadNode->getBasePtr();
+ if (LoadNode->getAddressSpace() == AMDGPUAS::PARAM_I_ADDRESS &&
+ isa<ConstantSDNode>(Ptr))
+ return constBufferLoad(LoadNode, AMDGPUAS::CONSTANT_BUFFER_0, DAG);
+ break;
+ }
+
default: break;
}
diff --git a/lib/Target/AMDGPU/R600ISelLowering.h b/lib/Target/AMDGPU/R600ISelLowering.h
index 907d1f10e1519..767c3c7bd5bfe 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.h
+++ b/lib/Target/AMDGPU/R600ISelLowering.h
@@ -98,9 +98,11 @@ private:
bool isHWTrueValue(SDValue Op) const;
bool isHWFalseValue(SDValue Op) const;
- bool FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src,
- SDValue &Neg, SDValue &Abs, SDValue &Sel, SDValue &Imm,
- SelectionDAG &DAG) const;
+ bool FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src,
+ SDValue &Neg, SDValue &Abs, SDValue &Sel, SDValue &Imm,
+ SelectionDAG &DAG) const;
+ SDValue constBufferLoad(LoadSDNode *LoadNode, int Block,
+ SelectionDAG &DAG) const;
SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override;
};
diff --git a/lib/Target/AMDGPU/R600InstrInfo.cpp b/lib/Target/AMDGPU/R600InstrInfo.cpp
index 5397e779474c8..9cc3e5f3c314c 100644
--- a/lib/Target/AMDGPU/R600InstrInfo.cpp
+++ b/lib/Target/AMDGPU/R600InstrInfo.cpp
@@ -229,11 +229,11 @@ bool R600InstrInfo::mustBeLastInClause(unsigned Opcode) const {
}
bool R600InstrInfo::usesAddressRegister(MachineInstr &MI) const {
- return MI.findRegisterUseOperandIdx(R600::AR_X) != -1;
+ return MI.findRegisterUseOperandIdx(R600::AR_X, false, &RI) != -1;
}
bool R600InstrInfo::definesAddressRegister(MachineInstr &MI) const {
- return MI.findRegisterDefOperandIdx(R600::AR_X) != -1;
+ return MI.findRegisterDefOperandIdx(R600::AR_X, false, false, &RI) != -1;
}
bool R600InstrInfo::readsLDSSrcReg(const MachineInstr &MI) const {
@@ -1500,19 +1500,19 @@ void R600InstrInfo::clearFlag(MachineInstr &MI, unsigned Operand,
}
unsigned R600InstrInfo::getAddressSpaceForPseudoSourceKind(
- PseudoSourceValue::PSVKind Kind) const {
+ unsigned Kind) const {
switch (Kind) {
case PseudoSourceValue::Stack:
case PseudoSourceValue::FixedStack:
- return ST.getAMDGPUAS().PRIVATE_ADDRESS;
+ return AMDGPUAS::PRIVATE_ADDRESS;
case PseudoSourceValue::ConstantPool:
case PseudoSourceValue::GOT:
case PseudoSourceValue::JumpTable:
case PseudoSourceValue::GlobalValueCallEntry:
case PseudoSourceValue::ExternalSymbolCallEntry:
case PseudoSourceValue::TargetCustom:
- return ST.getAMDGPUAS().CONSTANT_ADDRESS;
+ return AMDGPUAS::CONSTANT_ADDRESS;
}
+
llvm_unreachable("Invalid pseudo source kind");
- return ST.getAMDGPUAS().PRIVATE_ADDRESS;
}
diff --git a/lib/Target/AMDGPU/R600InstrInfo.h b/lib/Target/AMDGPU/R600InstrInfo.h
index 7a3dece316650..e6e34dc125f4d 100644
--- a/lib/Target/AMDGPU/R600InstrInfo.h
+++ b/lib/Target/AMDGPU/R600InstrInfo.h
@@ -324,7 +324,7 @@ public:
}
unsigned getAddressSpaceForPseudoSourceKind(
- PseudoSourceValue::PSVKind Kind) const override;
+ unsigned Kind) const override;
};
namespace R600 {
diff --git a/lib/Target/AMDGPU/R600Instructions.td b/lib/Target/AMDGPU/R600Instructions.td
index 7bf174f4cd864..10e8737552224 100644
--- a/lib/Target/AMDGPU/R600Instructions.td
+++ b/lib/Target/AMDGPU/R600Instructions.td
@@ -299,7 +299,7 @@ class VTX_READ <string name, dag outs, list<dag> pattern>
class LoadParamFrag <PatFrag load_type> : PatFrag <
(ops node:$ptr), (load_type node:$ptr),
[{ return isConstantLoad(cast<LoadSDNode>(N), 0) ||
- (cast<LoadSDNode>(N)->getAddressSpace() == AMDGPUASI.PARAM_I_ADDRESS); }]
+ (cast<LoadSDNode>(N)->getAddressSpace() == AMDGPUAS::PARAM_I_ADDRESS); }]
>;
def vtx_id3_az_extloadi8 : LoadParamFrag<az_extloadi8>;
@@ -309,8 +309,8 @@ def vtx_id3_load : LoadParamFrag<load>;
class LoadVtxId1 <PatFrag load> : PatFrag <
(ops node:$ptr), (load node:$ptr), [{
const MemSDNode *LD = cast<MemSDNode>(N);
- return LD->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
- (LD->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
+ return LD->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
+ (LD->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
!isa<GlobalValue>(GetUnderlyingObject(
LD->getMemOperand()->getValue(), CurDAG->getDataLayout())));
}]>;
@@ -322,7 +322,7 @@ def vtx_id1_load : LoadVtxId1 <load>;
class LoadVtxId2 <PatFrag load> : PatFrag <
(ops node:$ptr), (load node:$ptr), [{
const MemSDNode *LD = cast<MemSDNode>(N);
- return LD->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
+ return LD->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
isa<GlobalValue>(GetUnderlyingObject(
LD->getMemOperand()->getValue(), CurDAG->getDataLayout()));
}]>;
diff --git a/lib/Target/AMDGPU/R600MachineScheduler.cpp b/lib/Target/AMDGPU/R600MachineScheduler.cpp
index a1429a2ac50f1..7769a35aadcee 100644
--- a/lib/Target/AMDGPU/R600MachineScheduler.cpp
+++ b/lib/Target/AMDGPU/R600MachineScheduler.cpp
@@ -127,13 +127,13 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
LLVM_DEBUG(if (SU) {
dbgs() << " ** Pick node **\n";
- SU->dump(DAG);
+ DAG->dumpNode(*SU);
} else {
dbgs() << "NO NODE \n";
for (unsigned i = 0; i < DAG->SUnits.size(); i++) {
const SUnit &S = DAG->SUnits[i];
if (!S.isScheduled)
- S.dump(DAG);
+ DAG->dumpNode(S);
}
});
@@ -188,11 +188,11 @@ isPhysicalRegCopy(MachineInstr *MI) {
}
void R600SchedStrategy::releaseTopNode(SUnit *SU) {
- LLVM_DEBUG(dbgs() << "Top Releasing "; SU->dump(DAG););
+ LLVM_DEBUG(dbgs() << "Top Releasing "; DAG->dumpNode(*SU));
}
void R600SchedStrategy::releaseBottomNode(SUnit *SU) {
- LLVM_DEBUG(dbgs() << "Bottom Releasing "; SU->dump(DAG););
+ LLVM_DEBUG(dbgs() << "Bottom Releasing "; DAG->dumpNode(*SU));
if (isPhysicalRegCopy(SU->getInstr())) {
PhysicalRegCopy.push_back(SU);
return;
@@ -236,6 +236,7 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
// MI will become a KILL, don't considers it in scheduling
return AluDiscarded;
}
+ break;
default:
break;
}
diff --git a/lib/Target/AMDGPU/SIAddIMGInit.cpp b/lib/Target/AMDGPU/SIAddIMGInit.cpp
new file mode 100644
index 0000000000000..69cafef4a3513
--- /dev/null
+++ b/lib/Target/AMDGPU/SIAddIMGInit.cpp
@@ -0,0 +1,181 @@
+//===-- SIAddIMGInit.cpp - Add any required IMG inits ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Any MIMG instruction that uses tfe or lwe requires an initialization of the
+/// result register that will be written in the case of a memory access failure.
+/// The required code is also added to tie this init code to the result of the
+/// img instruction.
+///
+//===----------------------------------------------------------------------===//
+//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+
+#define DEBUG_TYPE "si-img-init"
+
+using namespace llvm;
+
+namespace {
+
+class SIAddIMGInit : public MachineFunctionPass {
+public:
+ static char ID;
+
+public:
+ SIAddIMGInit() : MachineFunctionPass(ID) {
+ initializeSIAddIMGInitPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS(SIAddIMGInit, DEBUG_TYPE, "SI Add IMG Init", false, false)
+
+char SIAddIMGInit::ID = 0;
+
+char &llvm::SIAddIMGInitID = SIAddIMGInit::ID;
+
+FunctionPass *llvm::createSIAddIMGInitPass() { return new SIAddIMGInit(); }
+
+bool SIAddIMGInit::runOnMachineFunction(MachineFunction &MF) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIRegisterInfo *RI = ST.getRegisterInfo();
+ bool Changed = false;
+
+ for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
+ ++BI) {
+ MachineBasicBlock &MBB = *BI;
+ MachineBasicBlock::iterator I, Next;
+ for (I = MBB.begin(); I != MBB.end(); I = Next) {
+ Next = std::next(I);
+ MachineInstr &MI = *I;
+
+ auto Opcode = MI.getOpcode();
+ if (TII->isMIMG(Opcode) && !MI.mayStore()) {
+ MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
+ MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
+ MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
+
+ // Check for instructions that don't have tfe or lwe fields
+ // There shouldn't be any at this point.
+ assert( (TFE && LWE) && "Expected tfe and lwe operands in instruction");
+
+ unsigned TFEVal = TFE->getImm();
+ unsigned LWEVal = LWE->getImm();
+ unsigned D16Val = D16 ? D16->getImm() : 0;
+
+ if (TFEVal || LWEVal) {
+ // At least one of TFE or LWE is non-zero.
+ // We have to insert a suitable initialization of the result value and
+ // tie this to the dest of the image instruction.
+
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ int DstIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
+
+ // Calculate which dword we have to initialize to 0.
+ MachineOperand *MO_Dmask =
+ TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
+
+ // check that dmask operand is found.
+ assert(MO_Dmask && "Expected dmask operand in instruction");
+
+ unsigned dmask = MO_Dmask->getImm();
+ // Determine the number of active lanes taking into account the
+ // Gather4 special case
+ unsigned ActiveLanes =
+ TII->isGather4(Opcode) ? 4 : countPopulation(dmask);
+
+ // Subreg indices are counted from 1
+ // With D16 we want the next whole VGPR after the written data.
+ static_assert(AMDGPU::sub0 == 1 && AMDGPU::sub4 == 5, "Subreg indices different from expected");
+
+ bool Packed = !ST.hasUnpackedD16VMem();
+
+ unsigned InitIdx =
+ D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
+
+ // Abandon attempt if the dst size isn't large enough
+ // - this is in fact an error but this is picked up elsewhere and
+ // reported correctly.
+ uint32_t DstSize =
+ RI->getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
+ if (DstSize < InitIdx)
+ continue;
+
+ // Create a register for the initialization value.
+ unsigned PrevDst =
+ MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
+ unsigned NewDst = 0; // Final initialized value will be in here
+
+ // If PRTStrictNull feature is enabled (the default) then initialize
+ // all the result registers to 0, otherwise just the error indication
+ // register (VGPRn+1)
+ unsigned SizeLeft = ST.usePRTStrictNull() ? InitIdx : 1;
+ unsigned CurrIdx = ST.usePRTStrictNull() ? 1 : InitIdx;
+
+ if (DstSize == 1) {
+ // In this case we can just initialize the result directly
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), PrevDst)
+ .addImm(0);
+ NewDst = PrevDst;
+ } else {
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
+ for (; SizeLeft; SizeLeft--, CurrIdx++) {
+ NewDst =
+ MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
+ // Initialize dword
+ unsigned SubReg =
+ MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
+ .addImm(0);
+ // Insert into the super-reg
+ BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
+ .addReg(PrevDst)
+ .addReg(SubReg)
+ .addImm(CurrIdx);
+
+ PrevDst = NewDst;
+ }
+ }
+
+ // Add as an implicit operand
+ MachineInstrBuilder(MF, MI).addReg(NewDst, RegState::Implicit);
+
+ // Tie the just added implicit operand to the dst
+ MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
+
+ Changed = true;
+ }
+ }
+ }
+ }
+
+ return Changed;
+}
diff --git a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
index 74f1bd8fb9866..98e9ea662324f 100644
--- a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -16,7 +16,7 @@
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/BasicBlock.h"
@@ -52,7 +52,7 @@ using StackEntry = std::pair<BasicBlock *, Value *>;
using StackVector = SmallVector<StackEntry, 16>;
class SIAnnotateControlFlow : public FunctionPass {
- DivergenceAnalysis *DA;
+ LegacyDivergenceAnalysis *DA;
Type *Boolean;
Type *Void;
@@ -66,9 +66,7 @@ class SIAnnotateControlFlow : public FunctionPass {
Function *If;
Function *Else;
- Function *Break;
Function *IfBreak;
- Function *ElseBreak;
Function *Loop;
Function *EndCf;
@@ -95,8 +93,7 @@ class SIAnnotateControlFlow : public FunctionPass {
Value *
handleLoopCondition(Value *Cond, PHINode *Broken, llvm::Loop *L,
- BranchInst *Term,
- SmallVectorImpl<WeakTrackingVH> &LoopPhiConditions);
+ BranchInst *Term);
void handleLoop(BranchInst *Term);
@@ -116,7 +113,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<LoopInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<DivergenceAnalysis>();
+ AU.addRequired<LegacyDivergenceAnalysis>();
AU.addPreserved<DominatorTreeWrapperPass>();
FunctionPass::getAnalysisUsage(AU);
}
@@ -127,7 +124,7 @@ public:
INITIALIZE_PASS_BEGIN(SIAnnotateControlFlow, DEBUG_TYPE,
"Annotate SI Control Flow", false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_END(SIAnnotateControlFlow, DEBUG_TYPE,
"Annotate SI Control Flow", false, false)
@@ -149,9 +146,7 @@ bool SIAnnotateControlFlow::doInitialization(Module &M) {
If = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if);
Else = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_else);
- Break = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_break);
IfBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if_break);
- ElseBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_else_break);
Loop = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_loop);
EndCf = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_end_cf);
return false;
@@ -160,7 +155,7 @@ bool SIAnnotateControlFlow::doInitialization(Module &M) {
/// Is the branch condition uniform or did the StructurizeCFG pass
/// consider it as such?
bool SIAnnotateControlFlow::isUniform(BranchInst *T) {
- return DA->isUniform(T->getCondition()) ||
+ return DA->isUniform(T) ||
T->getMetadata("structurizecfg.uniform") != nullptr;
}
@@ -227,76 +222,7 @@ void SIAnnotateControlFlow::insertElse(BranchInst *Term) {
/// Recursively handle the condition leading to a loop
Value *SIAnnotateControlFlow::handleLoopCondition(
- Value *Cond, PHINode *Broken, llvm::Loop *L, BranchInst *Term,
- SmallVectorImpl<WeakTrackingVH> &LoopPhiConditions) {
- // Only search through PHI nodes which are inside the loop. If we try this
- // with PHI nodes that are outside of the loop, we end up inserting new PHI
- // nodes outside of the loop which depend on values defined inside the loop.
- // This will break the module with
- // 'Instruction does not dominate all users!' errors.
- PHINode *Phi = nullptr;
- if ((Phi = dyn_cast<PHINode>(Cond)) && L->contains(Phi)) {
- BasicBlock *Parent = Phi->getParent();
- PHINode *NewPhi = PHINode::Create(Int64, 0, "loop.phi", &Parent->front());
- Value *Ret = NewPhi;
-
- // Handle all non-constant incoming values first
- for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) {
- Value *Incoming = Phi->getIncomingValue(i);
- BasicBlock *From = Phi->getIncomingBlock(i);
- if (isa<ConstantInt>(Incoming)) {
- NewPhi->addIncoming(Broken, From);
- continue;
- }
-
- Phi->setIncomingValue(i, BoolFalse);
- Value *PhiArg = handleLoopCondition(Incoming, Broken, L,
- Term, LoopPhiConditions);
- NewPhi->addIncoming(PhiArg, From);
- }
-
- BasicBlock *IDom = DT->getNode(Parent)->getIDom()->getBlock();
-
- for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) {
- Value *Incoming = Phi->getIncomingValue(i);
- if (Incoming != BoolTrue)
- continue;
-
- BasicBlock *From = Phi->getIncomingBlock(i);
- if (From == IDom) {
- // We're in the following situation:
- // IDom/From
- // | \
- // | If-block
- // | /
- // Parent
- // where we want to break out of the loop if the If-block is not taken.
- // Due to the depth-first traversal, there should be an end.cf
- // intrinsic in Parent, and we insert an else.break before it.
- //
- // Note that the end.cf need not be the first non-phi instruction
- // of parent, particularly when we're dealing with a multi-level
- // break, but it should occur within a group of intrinsic calls
- // at the beginning of the block.
- CallInst *OldEnd = dyn_cast<CallInst>(Parent->getFirstInsertionPt());
- while (OldEnd && OldEnd->getCalledFunction() != EndCf)
- OldEnd = dyn_cast<CallInst>(OldEnd->getNextNode());
- if (OldEnd && OldEnd->getCalledFunction() == EndCf) {
- Value *Args[] = { OldEnd->getArgOperand(0), NewPhi };
- Ret = CallInst::Create(ElseBreak, Args, "", OldEnd);
- continue;
- }
- }
-
- TerminatorInst *Insert = From->getTerminator();
- Value *PhiArg = CallInst::Create(Break, Broken, "", Insert);
- NewPhi->setIncomingValue(i, PhiArg);
- }
-
- LoopPhiConditions.push_back(WeakTrackingVH(Phi));
- return Ret;
- }
-
+ Value *Cond, PHINode *Broken, llvm::Loop *L, BranchInst *Term) {
if (Instruction *Inst = dyn_cast<Instruction>(Cond)) {
BasicBlock *Parent = Inst->getParent();
Instruction *Insert;
@@ -335,21 +261,15 @@ void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
BasicBlock *Target = Term->getSuccessor(1);
PHINode *Broken = PHINode::Create(Int64, 0, "phi.broken", &Target->front());
- SmallVector<WeakTrackingVH, 8> LoopPhiConditions;
Value *Cond = Term->getCondition();
Term->setCondition(BoolTrue);
- Value *Arg = handleLoopCondition(Cond, Broken, L, Term, LoopPhiConditions);
+ Value *Arg = handleLoopCondition(Cond, Broken, L, Term);
for (BasicBlock *Pred : predecessors(Target))
Broken->addIncoming(Pred == BB ? Arg : Int64Zero, Pred);
Term->setCondition(CallInst::Create(Loop, Arg, "", Term));
- for (WeakTrackingVH Val : llvm::reverse(LoopPhiConditions)) {
- if (PHINode *Cond = cast_or_null<PHINode>(Val))
- eraseIfUnused(Cond);
- }
-
push(Term->getSuccessor(0), Arg);
}
@@ -372,7 +292,8 @@ void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
Preds.push_back(Pred);
}
- BB = SplitBlockPredecessors(BB, Preds, "endcf.split", DT, LI, false);
+ BB = SplitBlockPredecessors(BB, Preds, "endcf.split", DT, LI, nullptr,
+ false);
}
Value *Exec = popSaved();
@@ -386,7 +307,7 @@ void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
bool SIAnnotateControlFlow::runOnFunction(Function &F) {
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- DA = &getAnalysis<DivergenceAnalysis>();
+ DA = &getAnalysis<LegacyDivergenceAnalysis>();
for (df_iterator<BasicBlock *> I = df_begin(&F.getEntryBlock()),
E = df_end(&F.getEntryBlock()); I != E; ++I) {
diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h
index a6d28d6999e5f..7f6abc34cff3a 100644
--- a/lib/Target/AMDGPU/SIDefines.h
+++ b/lib/Target/AMDGPU/SIDefines.h
@@ -88,7 +88,10 @@ enum : uint64_t {
IsPacked = UINT64_C(1) << 49,
// Is a D16 buffer instruction.
- D16Buf = UINT64_C(1) << 50
+ D16Buf = UINT64_C(1) << 50,
+
+ // Uses floating point double precision rounding mode
+ FPDPRounding = UINT64_C(1) << 51
};
// v_cmp_class_* etc. use a 10-bit mask for what operation is checked.
diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 566e0d3febc78..809f5bab46932 100644
--- a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -183,13 +183,15 @@ getCopyRegClasses(const MachineInstr &Copy,
static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC,
const TargetRegisterClass *DstRC,
const SIRegisterInfo &TRI) {
- return TRI.isSGPRClass(DstRC) && TRI.hasVGPRs(SrcRC);
+ return SrcRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(DstRC) &&
+ TRI.hasVGPRs(SrcRC);
}
static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,
const TargetRegisterClass *DstRC,
const SIRegisterInfo &TRI) {
- return TRI.isSGPRClass(SrcRC) && TRI.hasVGPRs(DstRC);
+ return DstRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(SrcRC) &&
+ TRI.hasVGPRs(DstRC);
}
static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI,
@@ -327,9 +329,7 @@ static bool phiHasBreakDef(const MachineInstr &PHI,
switch (DefInstr->getOpcode()) {
default:
break;
- case AMDGPU::SI_BREAK:
case AMDGPU::SI_IF_BREAK:
- case AMDGPU::SI_ELSE_BREAK:
return true;
case AMDGPU::PHI:
if (phiHasBreakDef(*DefInstr, MRI, Visited))
@@ -599,7 +599,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) {
unsigned SrcReg = MI.getOperand(1).getReg();
if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) {
- TII->moveToVALU(MI);
+ TII->moveToVALU(MI, MDT);
break;
}
@@ -614,7 +614,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
MI.setDesc(TII->get(SMovOp));
break;
}
- TII->moveToVALU(MI);
+ TII->moveToVALU(MI, MDT);
} else if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) {
tryChangeVGPRtoSGPRinCopy(MI, TRI, TII);
}
@@ -677,7 +677,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
SmallSet<unsigned, 8> Visited;
if (HasVGPROperand || !phiHasBreakDef(MI, MRI, Visited)) {
LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI);
- TII->moveToVALU(MI);
+ TII->moveToVALU(MI, MDT);
}
break;
}
@@ -690,7 +690,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
LLVM_DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI);
- TII->moveToVALU(MI);
+ TII->moveToVALU(MI, MDT);
break;
case AMDGPU::INSERT_SUBREG: {
const TargetRegisterClass *DstRC, *Src0RC, *Src1RC;
@@ -700,7 +700,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
if (TRI->isSGPRClass(DstRC) &&
(TRI->hasVGPRs(Src0RC) || TRI->hasVGPRs(Src1RC))) {
LLVM_DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI);
- TII->moveToVALU(MI);
+ TII->moveToVALU(MI, MDT);
}
break;
}
diff --git a/lib/Target/AMDGPU/SIFixWWMLiveness.cpp b/lib/Target/AMDGPU/SIFixWWMLiveness.cpp
index 5d613d8874fab..7761418c53364 100644
--- a/lib/Target/AMDGPU/SIFixWWMLiveness.cpp
+++ b/lib/Target/AMDGPU/SIFixWWMLiveness.cpp
@@ -10,7 +10,7 @@
/// \file
/// Computations in WWM can overwrite values in inactive channels for
/// variables that the register allocator thinks are dead. This pass adds fake
-/// uses of those variables to WWM instructions to make sure that they aren't
+/// uses of those variables to their def(s) to make sure that they aren't
/// overwritten.
///
/// As an example, consider this snippet:
@@ -29,25 +29,44 @@
/// second write to %vgpr0 anyways. But if %vgpr1 is written with WWM enabled,
/// it would clobber even the inactive channels for which the if-condition is
/// false, for which %vgpr0 is supposed to be 0. This pass adds an implicit use
-/// of %vgpr0 to the WWM instruction to make sure they aren't allocated to the
+/// of %vgpr0 to its def to make sure they aren't allocated to the
/// same register.
///
/// In general, we need to figure out what registers might have their inactive
/// channels which are eventually used accidentally clobbered by a WWM
-/// instruction. We approximate this using two conditions:
+/// instruction. We do that by spotting three separate cases of registers:
///
-/// 1. A definition of the variable reaches the WWM instruction.
-/// 2. The variable would be live at the WWM instruction if all its defs were
-/// partial defs (i.e. considered as a use), ignoring normal uses.
+/// 1. A "then phi": the value resulting from phi elimination of a phi node at
+/// the end of an if..endif. If there is WWM code in the "then", then we
+/// make the def at the end of the "then" branch a partial def by adding an
+/// implicit use of the register.
///
-/// If a register matches both conditions, then we add an implicit use of it to
-/// the WWM instruction. Condition #2 is the heart of the matter: every
-/// definition is really a partial definition, since every VALU instruction is
-/// implicitly predicated. We can usually ignore this, but WWM forces us not
-/// to. Condition #1 prevents false positives if the variable is undefined at
-/// the WWM instruction anyways. This is overly conservative in certain cases,
-/// especially in uniform control flow, but this is a workaround anyways until
-/// LLVM gains the notion of predicated uses and definitions of variables.
+/// 2. A "loop exit register": a value written inside a loop but used outside the
+/// loop, where there is WWM code inside the loop (the case in the example
+/// above). We add an implicit_def of the register in the loop pre-header,
+/// and make the original def a partial def by adding an implicit use of the
+/// register.
+///
+/// 3. A "loop exit phi": the value resulting from phi elimination of a phi node
+/// in a loop header. If there is WWM code inside the loop, then we make all
+/// defs inside the loop partial defs by adding an implicit use of the
+/// register on each one.
+///
+/// Note that we do not need to consider an if..else..endif phi. We only need to
+/// consider non-uniform control flow, and control flow structurization would
+/// have transformed a non-uniform if..else..endif into two if..endifs.
+///
+/// The analysis to detect these cases relies on a property of the MIR
+/// arising from this pass running straight after PHIElimination and before any
+/// coalescing: that any virtual register with more than one definition must be
+/// the new register added to lower a phi node by PHIElimination.
+///
+/// FIXME: We should detect whether a register in one of the above categories is
+/// already live at the WWM code before deciding to add the implicit uses to
+/// synthesize its liveness.
+///
+/// FIXME: I believe this whole scheme may be flawed due to the possibility of
+/// the register allocator doing live interval splitting.
///
//===----------------------------------------------------------------------===//
@@ -59,7 +78,9 @@
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SparseBitVector.h"
#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
@@ -71,10 +92,18 @@ namespace {
class SIFixWWMLiveness : public MachineFunctionPass {
private:
+ MachineDominatorTree *DomTree;
+ MachineLoopInfo *LoopInfo;
LiveIntervals *LIS = nullptr;
+ const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
MachineRegisterInfo *MRI;
+ std::vector<MachineInstr *> WWMs;
+ std::vector<MachineOperand *> ThenDefs;
+ std::vector<std::pair<MachineOperand *, MachineLoop *>> LoopExitDefs;
+ std::vector<std::pair<MachineOperand *, MachineLoop *>> LoopPhiDefs;
+
public:
static char ID;
@@ -84,13 +113,11 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override;
- bool runOnWWMInstruction(MachineInstr &MI);
-
- void addDefs(const MachineInstr &MI, SparseBitVector<> &set);
-
StringRef getPassName() const override { return "SI Fix WWM Liveness"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequiredID(MachineDominatorsID);
+ AU.addRequiredID(MachineLoopInfoID);
// Should preserve the same set that TwoAddressInstructions does.
AU.addPreserved<SlotIndexes>();
AU.addPreserved<LiveIntervals>();
@@ -100,11 +127,21 @@ public:
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
+
+private:
+ void processDef(MachineOperand &DefOpnd);
+ bool processThenDef(MachineOperand *DefOpnd);
+ bool processLoopExitDef(MachineOperand *DefOpnd, MachineLoop *Loop);
+ bool processLoopPhiDef(MachineOperand *DefOpnd, MachineLoop *Loop);
};
} // End anonymous namespace.
-INITIALIZE_PASS(SIFixWWMLiveness, DEBUG_TYPE,
+INITIALIZE_PASS_BEGIN(SIFixWWMLiveness, DEBUG_TYPE,
+ "SI fix WWM liveness", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_END(SIFixWWMLiveness, DEBUG_TYPE,
"SI fix WWM liveness", false, false)
char SIFixWWMLiveness::ID = 0;
@@ -115,89 +152,267 @@ FunctionPass *llvm::createSIFixWWMLivenessPass() {
return new SIFixWWMLiveness();
}
-void SIFixWWMLiveness::addDefs(const MachineInstr &MI, SparseBitVector<> &Regs)
-{
- for (const MachineOperand &Op : MI.defs()) {
- if (Op.isReg()) {
- unsigned Reg = Op.getReg();
- if (TRI->isVGPR(*MRI, Reg))
- Regs.set(Reg);
- }
- }
-}
+bool SIFixWWMLiveness::runOnMachineFunction(MachineFunction &MF) {
+ LLVM_DEBUG(dbgs() << "SIFixWWMLiveness: function " << MF.getName() << "\n");
+ bool Modified = false;
+
+ // This doesn't actually need LiveIntervals, but we can preserve them.
+ LIS = getAnalysisIfAvailable<LiveIntervals>();
-bool SIFixWWMLiveness::runOnWWMInstruction(MachineInstr &WWM) {
- MachineBasicBlock *MBB = WWM.getParent();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- // Compute the registers that are live out of MI by figuring out which defs
- // are reachable from MI.
- SparseBitVector<> LiveOut;
+ TII = ST.getInstrInfo();
+ TRI = &TII->getRegisterInfo();
+ MRI = &MF.getRegInfo();
- for (auto II = MachineBasicBlock::iterator(WWM), IE =
- MBB->end(); II != IE; ++II) {
- addDefs(*II, LiveOut);
- }
+ DomTree = &getAnalysis<MachineDominatorTree>();
+ LoopInfo = &getAnalysis<MachineLoopInfo>();
- for (df_iterator<MachineBasicBlock *> I = ++df_begin(MBB),
- E = df_end(MBB);
- I != E; ++I) {
- for (const MachineInstr &MI : **I) {
- addDefs(MI, LiveOut);
+ // Scan the function to find the WWM sections and the candidate registers for
+ // having liveness modified.
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ if (MI.getOpcode() == AMDGPU::EXIT_WWM)
+ WWMs.push_back(&MI);
+ else {
+ for (MachineOperand &DefOpnd : MI.defs()) {
+ if (DefOpnd.isReg()) {
+ unsigned Reg = DefOpnd.getReg();
+ if (TRI->isVGPR(*MRI, Reg))
+ processDef(DefOpnd);
+ }
+ }
+ }
}
}
+ if (!WWMs.empty()) {
+ // Synthesize liveness over WWM sections as required.
+ for (auto ThenDef : ThenDefs)
+ Modified |= processThenDef(ThenDef);
+ for (auto LoopExitDef : LoopExitDefs)
+ Modified |= processLoopExitDef(LoopExitDef.first, LoopExitDef.second);
+ for (auto LoopPhiDef : LoopPhiDefs)
+ Modified |= processLoopPhiDef(LoopPhiDef.first, LoopPhiDef.second);
+ }
- // Compute the registers that reach MI.
- SparseBitVector<> Reachable;
+ WWMs.clear();
+ ThenDefs.clear();
+ LoopExitDefs.clear();
+ LoopPhiDefs.clear();
- for (auto II = ++MachineBasicBlock::reverse_iterator(WWM), IE =
- MBB->rend(); II != IE; ++II) {
- addDefs(*II, Reachable);
- }
+ return Modified;
+}
- for (idf_iterator<MachineBasicBlock *> I = ++idf_begin(MBB),
- E = idf_end(MBB);
- I != E; ++I) {
- for (const MachineInstr &MI : **I) {
- addDefs(MI, Reachable);
+// During the function scan, process an operand that defines a VGPR.
+// This categorizes the register and puts it in the appropriate list for later
+// use when processing a WWM section.
+void SIFixWWMLiveness::processDef(MachineOperand &DefOpnd) {
+ unsigned Reg = DefOpnd.getReg();
+ // Get all the defining instructions. For convenience, make Defs[0] the def
+ // we are on now.
+ SmallVector<const MachineInstr *, 4> Defs;
+ Defs.push_back(DefOpnd.getParent());
+ for (auto &MI : MRI->def_instructions(Reg)) {
+ if (&MI != DefOpnd.getParent())
+ Defs.push_back(&MI);
+ }
+ // Check whether this def dominates all the others. If not, ignore this def.
+ // Either it is going to be processed when the scan encounters its other def
+ // that dominates all defs, or there is no def that dominates all others.
+ // The latter case is an eliminated phi from an if..else..endif or similar,
+ // which must be for uniform control flow so can be ignored.
+ // Because this pass runs shortly after PHIElimination, we assume that any
+ // multi-def register is a lowered phi, and thus has each def in a separate
+ // basic block.
+ for (unsigned I = 1; I != Defs.size(); ++I) {
+ if (!DomTree->dominates(Defs[0]->getParent(), Defs[I]->getParent()))
+ return;
+ }
+ // Check for the case of an if..endif lowered phi: It has two defs, one
+ // dominates the other, and there is a single use in a successor of the
+ // dominant def.
+ // Later we will spot any WWM code inside
+ // the "then" clause and turn the second def into a partial def so its
+ // liveness goes through the WWM code in the "then" clause.
+ if (Defs.size() == 2) {
+ auto DomDefBlock = Defs[0]->getParent();
+ if (DomDefBlock->succ_size() == 2 && MRI->hasOneUse(Reg)) {
+ auto UseBlock = MRI->use_begin(Reg)->getParent()->getParent();
+ for (auto Succ : DomDefBlock->successors()) {
+ if (Succ == UseBlock) {
+ LLVM_DEBUG(dbgs() << printReg(Reg, TRI) << " is a then phi reg\n");
+ ThenDefs.push_back(&DefOpnd);
+ return;
+ }
+ }
}
}
-
- // find the intersection, and add implicit uses.
- LiveOut &= Reachable;
-
- bool Modified = false;
- for (unsigned Reg : LiveOut) {
- WWM.addOperand(MachineOperand::CreateReg(Reg, false, /*isImp=*/true));
- if (LIS) {
- // FIXME: is there a better way to update the live interval?
- LIS->removeInterval(Reg);
- LIS->createAndComputeVirtRegInterval(Reg);
+ // Check for the case of a non-lowered-phi register (single def) that exits
+ // a loop, that is, it has a use that is outside a loop that the def is
+ // inside. We find the outermost loop that the def is inside but a use is
+ // outside. Later we will spot any WWM code inside that loop and then make
+ // the def a partial def so its liveness goes round the loop and through the
+ // WWM code.
+ if (Defs.size() == 1) {
+ auto Loop = LoopInfo->getLoopFor(Defs[0]->getParent());
+ if (!Loop)
+ return;
+ bool IsLoopExit = false;
+ for (auto &Use : MRI->use_instructions(Reg)) {
+ auto UseBlock = Use.getParent();
+ if (Loop->contains(UseBlock))
+ continue;
+ IsLoopExit = true;
+ while (auto Parent = Loop->getParentLoop()) {
+ if (Parent->contains(UseBlock))
+ break;
+ Loop = Parent;
+ }
}
- Modified = true;
+ if (!IsLoopExit)
+ return;
+ LLVM_DEBUG(dbgs() << printReg(Reg, TRI)
+ << " is a loop exit reg with loop header at "
+ << "bb." << Loop->getHeader()->getNumber() << "\n");
+ LoopExitDefs.push_back(std::pair<MachineOperand *, MachineLoop *>(
+ &DefOpnd, Loop));
+ return;
}
-
- return Modified;
+ // Check for the case of a lowered single-preheader-loop phi, that is, a
+ // multi-def register where the dominating def is in the loop pre-header and
+ // all other defs are in backedges. Later we will spot any WWM code inside
+ // that loop and then make the backedge defs partial defs so the liveness
+ // goes through the WWM code.
+ // Note that we are ignoring multi-preheader loops on the basis that the
+ // structurizer does not allow that for non-uniform loops.
+ // There must be a single use in the loop header.
+ if (!MRI->hasOneUse(Reg))
+ return;
+ auto UseBlock = MRI->use_begin(Reg)->getParent()->getParent();
+ auto Loop = LoopInfo->getLoopFor(UseBlock);
+ if (!Loop || Loop->getHeader() != UseBlock
+ || Loop->contains(Defs[0]->getParent())) {
+ LLVM_DEBUG(dbgs() << printReg(Reg, TRI)
+ << " is multi-def but single use not in loop header\n");
+ return;
+ }
+ for (unsigned I = 1; I != Defs.size(); ++I) {
+ if (!Loop->contains(Defs[I]->getParent()))
+ return;
+ }
+ LLVM_DEBUG(dbgs() << printReg(Reg, TRI)
+ << " is a loop phi reg with loop header at "
+ << "bb." << Loop->getHeader()->getNumber() << "\n");
+ LoopPhiDefs.push_back(
+ std::pair<MachineOperand *, MachineLoop *>(&DefOpnd, Loop));
}
-bool SIFixWWMLiveness::runOnMachineFunction(MachineFunction &MF) {
- bool Modified = false;
-
- // This doesn't actually need LiveIntervals, but we can preserve them.
- LIS = getAnalysisIfAvailable<LiveIntervals>();
-
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- const SIInstrInfo *TII = ST.getInstrInfo();
-
- TRI = &TII->getRegisterInfo();
- MRI = &MF.getRegInfo();
+// Process a then phi def: It has two defs, one dominates the other, and there
+// is a single use in a successor of the dominant def. Here we spot any WWM
+// code inside the "then" clause and turn the second def into a partial def so
+// its liveness goes through the WWM code in the "then" clause.
+bool SIFixWWMLiveness::processThenDef(MachineOperand *DefOpnd) {
+ LLVM_DEBUG(dbgs() << "Processing then def: " << *DefOpnd->getParent());
+ if (DefOpnd->getParent()->getOpcode() == TargetOpcode::IMPLICIT_DEF) {
+ // Ignore if dominating def is undef.
+ LLVM_DEBUG(dbgs() << " ignoring as dominating def is undef\n");
+ return false;
+ }
+ unsigned Reg = DefOpnd->getReg();
+ // Get the use block, which is the endif block.
+ auto UseBlock = MRI->use_instr_begin(Reg)->getParent();
+ // Check whether there is WWM code inside the then branch. The WWM code must
+ // be dominated by the if but not dominated by the endif.
+ bool ContainsWWM = false;
+ for (auto WWM : WWMs) {
+ if (DomTree->dominates(DefOpnd->getParent()->getParent(), WWM->getParent())
+ && !DomTree->dominates(UseBlock, WWM->getParent())) {
+ LLVM_DEBUG(dbgs() << " contains WWM: " << *WWM);
+ ContainsWWM = true;
+ break;
+ }
+ }
+ if (!ContainsWWM)
+ return false;
+ // Get the other def.
+ MachineInstr *OtherDef = nullptr;
+ for (auto &MI : MRI->def_instructions(Reg)) {
+ if (&MI != DefOpnd->getParent())
+ OtherDef = &MI;
+ }
+ // Make it a partial def.
+ OtherDef->addOperand(MachineOperand::CreateReg(Reg, false, /*isImp=*/true));
+ LLVM_DEBUG(dbgs() << *OtherDef);
+ return true;
+}
- for (MachineBasicBlock &MBB : MF) {
- for (MachineInstr &MI : MBB) {
- if (MI.getOpcode() == AMDGPU::EXIT_WWM) {
- Modified |= runOnWWMInstruction(MI);
- }
+// Process a loop exit def, that is, a register with a single def in a loop
+// that has a use outside the loop. Here we spot any WWM code inside that loop
+// and then make the def a partial def so its liveness goes round the loop and
+// through the WWM code.
+bool SIFixWWMLiveness::processLoopExitDef(MachineOperand *DefOpnd,
+ MachineLoop *Loop) {
+ LLVM_DEBUG(dbgs() << "Processing loop exit def: " << *DefOpnd->getParent());
+ // Check whether there is WWM code inside the loop.
+ bool ContainsWWM = false;
+ for (auto WWM : WWMs) {
+ if (Loop->contains(WWM->getParent())) {
+ LLVM_DEBUG(dbgs() << " contains WWM: " << *WWM);
+ ContainsWWM = true;
+ break;
}
}
+ if (!ContainsWWM)
+ return false;
+ unsigned Reg = DefOpnd->getReg();
+ // Add a new implicit_def in loop preheader(s).
+ for (auto Pred : Loop->getHeader()->predecessors()) {
+ if (!Loop->contains(Pred)) {
+ auto ImplicitDef = BuildMI(*Pred, Pred->getFirstTerminator(), DebugLoc(),
+ TII->get(TargetOpcode::IMPLICIT_DEF), Reg);
+ LLVM_DEBUG(dbgs() << *ImplicitDef);
+ (void)ImplicitDef;
+ }
+ }
+ // Make the original def partial.
+ DefOpnd->getParent()->addOperand(MachineOperand::CreateReg(
+ Reg, false, /*isImp=*/true));
+ LLVM_DEBUG(dbgs() << *DefOpnd->getParent());
+ return true;
+}
- return Modified;
+// Process a loop phi def, that is, a multi-def register where the dominating
+// def is in the loop pre-header and all other defs are in backedges. Here we
+// spot any WWM code inside that loop and then make the backedge defs partial
+// defs so the liveness goes through the WWM code.
+bool SIFixWWMLiveness::processLoopPhiDef(MachineOperand *DefOpnd,
+ MachineLoop *Loop) {
+ LLVM_DEBUG(dbgs() << "Processing loop phi def: " << *DefOpnd->getParent());
+ // Check whether there is WWM code inside the loop.
+ bool ContainsWWM = false;
+ for (auto WWM : WWMs) {
+ if (Loop->contains(WWM->getParent())) {
+ LLVM_DEBUG(dbgs() << " contains WWM: " << *WWM);
+ ContainsWWM = true;
+ break;
+ }
+ }
+ if (!ContainsWWM)
+ return false;
+ unsigned Reg = DefOpnd->getReg();
+ // Remove kill mark from uses.
+ for (auto &Use : MRI->use_operands(Reg))
+ Use.setIsKill(false);
+ // Make all defs except the dominating one partial defs.
+ SmallVector<MachineInstr *, 4> Defs;
+ for (auto &Def : MRI->def_instructions(Reg))
+ Defs.push_back(&Def);
+ for (auto Def : Defs) {
+ if (DefOpnd->getParent() == Def)
+ continue;
+ Def->addOperand(MachineOperand::CreateReg(Reg, false, /*isImp=*/true));
+ LLVM_DEBUG(dbgs() << *Def);
+ }
+ return true;
}
+
diff --git a/lib/Target/AMDGPU/SIFixupVectorISel.cpp b/lib/Target/AMDGPU/SIFixupVectorISel.cpp
new file mode 100644
index 0000000000000..ee39eb04d8316
--- /dev/null
+++ b/lib/Target/AMDGPU/SIFixupVectorISel.cpp
@@ -0,0 +1,231 @@
+//===-- SIFixupVectorISel.cpp - Fixup post ISel vector issues -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+/// SIFixupVectorISel pass cleans up post ISEL Vector issues.
+/// Currently this will convert GLOBAL_{LOAD|STORE}_*
+/// and GLOBAL_Atomic_* instructions into their _SADDR variants,
+/// feeding the sreg into the saddr field of the new instruction.
+/// We currently handle a REG_SEQUENCE feeding the vaddr
+/// and decompose it into a base and index.
+///
+/// Transform:
+/// %17:vgpr_32, %19:sreg_64_xexec = V_ADD_I32_e64 %21:sgpr_32, %22:vgpr_32
+/// %18:vgpr_32, %20:sreg_64_xexec = V_ADDC_U32_e64 %25:vgpr_32,
+/// %24:vgpr_32, %19:sreg_64_xexec
+/// %16:vreg_64 = REG_SEQUENCE %17:vgpr_32, %sub0, %18:vgpr_32, %sub1
+/// %11:vreg_64 = COPY %16:vreg_64
+/// %10:vgpr_32 = GLOBAL_LOAD_DWORD killed %11:vreg_64, 16, 0, 0
+/// Into:
+/// %4:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1:sgpr_64, 36, 0
+/// %14:vreg_64 = REG_SEQUENCE %6:vgpr_32, %sub0, %15:vgpr_32, %sub1
+/// %10:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %14:vreg_64, %4:sreg_64_xexec,16...
+///
+//===----------------------------------------------------------------------===//
+//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+#define DEBUG_TYPE "si-fixup-vector-isel"
+
+using namespace llvm;
+
+static cl::opt<bool> EnableGlobalSGPRAddr(
+ "amdgpu-enable-global-sgpr-addr",
+ cl::desc("Enable use of SGPR regs for GLOBAL LOAD/STORE instructions"),
+ cl::init(false));
+
+STATISTIC(NumSGPRGlobalOccurs, "Number of global ld/st opportunities");
+STATISTIC(NumSGPRGlobalSaddrs, "Number of global sgpr instructions converted");
+
+namespace {
+
+class SIFixupVectorISel : public MachineFunctionPass {
+public:
+ static char ID;
+
+public:
+ SIFixupVectorISel() : MachineFunctionPass(ID) {
+ initializeSIFixupVectorISelPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS(SIFixupVectorISel, DEBUG_TYPE,
+ "SI Fixup Vector ISel", false, false)
+
+char SIFixupVectorISel::ID = 0;
+
+char &llvm::SIFixupVectorISelID = SIFixupVectorISel::ID;
+
+FunctionPass *llvm::createSIFixupVectorISelPass() {
+ return new SIFixupVectorISel();
+}
+
+static bool findSRegBaseAndIndex(MachineOperand *Op,
+ unsigned &BaseReg,
+ unsigned &IndexReg,
+ MachineRegisterInfo &MRI,
+ const SIRegisterInfo *TRI) {
+ SmallVector<MachineOperand *, 8> Worklist;
+ Worklist.push_back(Op);
+ while (!Worklist.empty()) {
+ MachineOperand *WOp = Worklist.pop_back_val();
+ if (!WOp->isReg() ||
+ !TargetRegisterInfo::isVirtualRegister(WOp->getReg()))
+ continue;
+ MachineInstr *DefInst = MRI.getUniqueVRegDef(WOp->getReg());
+ switch (DefInst->getOpcode()) {
+ default:
+ continue;
+ case AMDGPU::COPY:
+ Worklist.push_back(&DefInst->getOperand(1));
+ break;
+ case AMDGPU::REG_SEQUENCE:
+ if (DefInst->getNumOperands() != 5)
+ continue;
+ Worklist.push_back(&DefInst->getOperand(1));
+ Worklist.push_back(&DefInst->getOperand(3));
+ break;
+ case AMDGPU::V_ADD_I32_e64:
+ // The V_ADD_* and its analogous V_ADDC_* are generated by
+ // a previous pass which lowered from an ADD_64_PSEUDO,
+ // which generates subregs to break up the 64 bit args.
+ if (DefInst->getOperand(2).getSubReg() != AMDGPU::NoSubRegister)
+ continue;
+ BaseReg = DefInst->getOperand(2).getReg();
+ if (DefInst->getOperand(3).getSubReg() != AMDGPU::NoSubRegister)
+ continue;
+ IndexReg = DefInst->getOperand(3).getReg();
+ // Chase the IndexReg.
+ MachineInstr *MI = MRI.getUniqueVRegDef(IndexReg);
+ if (!MI || !MI->isCopy())
+ continue;
+ // Make sure the reg class is 64 bit for Index.
+ // If the Index register is a subreg, we want it to reference
+ // a 64 bit register which we will use as the Index reg.
+ const TargetRegisterClass *IdxRC, *BaseRC;
+ IdxRC = MRI.getRegClass(MI->getOperand(1).getReg());
+ if (AMDGPU::getRegBitWidth(IdxRC->getID()) != 64)
+ continue;
+ IndexReg = MI->getOperand(1).getReg();
+ // Chase the BaseReg.
+ MI = MRI.getUniqueVRegDef(BaseReg);
+ if (!MI || !MI->isCopy())
+ continue;
+ // Make sure the register class is 64 bit for Base.
+ BaseReg = MI->getOperand(1).getReg();
+ BaseRC = MRI.getRegClass(BaseReg);
+ if (AMDGPU::getRegBitWidth(BaseRC->getID()) != 64)
+ continue;
+ // Make sure Base is SReg and Index is VReg.
+ if (!TRI->isSGPRReg(MRI, BaseReg))
+ return false;
+ if (!TRI->hasVGPRs(MRI.getRegClass(IndexReg)))
+ return false;
+ // Clear any killed flags on Index and Base regs, used later.
+ MRI.clearKillFlags(IndexReg);
+ MRI.clearKillFlags(BaseReg);
+ return true;
+ }
+ }
+ return false;
+}
+
+// Identify Global LOAD|STORE/ATOMIC and try to convert to _SADDR.
+static bool fixupGlobalSaddr(MachineBasicBlock &MBB,
+ MachineFunction &MF,
+ MachineRegisterInfo &MRI,
+ const GCNSubtarget &ST,
+ const SIInstrInfo *TII,
+ const SIRegisterInfo *TRI) {
+ if (!EnableGlobalSGPRAddr)
+ return false;
+ bool FuncModified = false;
+ MachineBasicBlock::iterator I, Next;
+ for (I = MBB.begin(); I != MBB.end(); I = Next) {
+ Next = std::next(I);
+ MachineInstr &MI = *I;
+ int NewOpcd = AMDGPU::getGlobalSaddrOp(MI.getOpcode());
+ if (NewOpcd < 0)
+ continue;
+ // Update our statistics on opportunities seen.
+ ++NumSGPRGlobalOccurs;
+ LLVM_DEBUG(dbgs() << "Global Mem opp " << MI << '\n');
+ // Need a Base and Index or we can't transform to _SADDR.
+ unsigned BaseReg = 0;
+ unsigned IndexReg = 0;
+ MachineOperand *Op = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
+ if (!findSRegBaseAndIndex(Op, BaseReg, IndexReg, MRI, TRI))
+ continue;
+ ++NumSGPRGlobalSaddrs;
+ FuncModified = true;
+ // Create the new _SADDR Memory instruction.
+ bool HasVdst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst) != nullptr;
+ MachineOperand *VData = TII->getNamedOperand(MI, AMDGPU::OpName::vdata);
+ MachineInstr *NewGlob = nullptr;
+ NewGlob = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpcd));
+ if (HasVdst)
+ NewGlob->addOperand(MF, MI.getOperand(0));
+ NewGlob->addOperand(MF, MachineOperand::CreateReg(IndexReg, false));
+ if (VData)
+ NewGlob->addOperand(MF, *VData);
+ NewGlob->addOperand(MF, MachineOperand::CreateReg(BaseReg, false));
+ NewGlob->addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::offset));
+
+ MachineOperand *Glc = TII->getNamedOperand(MI, AMDGPU::OpName::glc);
+ // Atomics don't have a GLC, so omit the field if not there.
+ if (Glc)
+ NewGlob->addOperand(MF, *Glc);
+ NewGlob->addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::slc));
+ // _D16 variants have a vdst_in operand, copy it in.
+ MachineOperand *VDstInOp = TII->getNamedOperand(MI,
+ AMDGPU::OpName::vdst_in);
+ if (VDstInOp)
+ NewGlob->addOperand(MF, *VDstInOp);
+ NewGlob->copyImplicitOps(MF, MI);
+ NewGlob->cloneMemRefs(MF, MI);
+ // Remove the old Global Memop instruction.
+ MI.eraseFromParent();
+ LLVM_DEBUG(dbgs() << "New Global Mem " << *NewGlob << '\n');
+ }
+ return FuncModified;
+}
+
+bool SIFixupVectorISel::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+ bool FuncModified = false;
+ for (MachineBasicBlock &MBB : MF) {
+ // Cleanup missed Saddr opportunities from ISel.
+ FuncModified |= fixupGlobalSaddr(MBB, MF, MRI, ST, TII, TRI);
+ }
+ return FuncModified;
+}
diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp
index 338cabcb906bc..f4e8669583699 100644
--- a/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -35,13 +35,16 @@ struct FoldCandidate {
uint64_t ImmToFold;
int FrameIndexToFold;
};
+ int ShrinkOpcode;
unsigned char UseOpNo;
MachineOperand::MachineOperandType Kind;
bool Commuted;
FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
- bool Commuted_ = false) :
- UseMI(MI), OpToFold(nullptr), UseOpNo(OpNo), Kind(FoldOp->getType()),
+ bool Commuted_ = false,
+ int ShrinkOp = -1) :
+ UseMI(MI), OpToFold(nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
+ Kind(FoldOp->getType()),
Commuted(Commuted_) {
if (FoldOp->isImm()) {
ImmToFold = FoldOp->getImm();
@@ -68,6 +71,14 @@ struct FoldCandidate {
bool isCommuted() const {
return Commuted;
}
+
+ bool needsShrink() const {
+ return ShrinkOpcode != -1;
+ }
+
+ int getShrinkOpcode() const {
+ return ShrinkOpcode;
+ }
};
class SIFoldOperands : public MachineFunctionPass {
@@ -154,6 +165,7 @@ FunctionPass *llvm::createSIFoldOperandsPass() {
}
static bool updateOperand(FoldCandidate &Fold,
+ const SIInstrInfo &TII,
const TargetRegisterInfo &TRI) {
MachineInstr *MI = Fold.UseMI;
MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
@@ -189,10 +201,49 @@ static bool updateOperand(FoldCandidate &Fold,
Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
}
}
+
+ if (Fold.needsShrink()) {
+ MachineBasicBlock *MBB = MI->getParent();
+ auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI);
+ if (Liveness != MachineBasicBlock::LQR_Dead)
+ return false;
+
+ MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+ int Op32 = Fold.getShrinkOpcode();
+ MachineOperand &Dst0 = MI->getOperand(0);
+ MachineOperand &Dst1 = MI->getOperand(1);
+ assert(Dst0.isDef() && Dst1.isDef());
+
+ bool HaveNonDbgCarryUse = !MRI.use_nodbg_empty(Dst1.getReg());
+
+ const TargetRegisterClass *Dst0RC = MRI.getRegClass(Dst0.getReg());
+ unsigned NewReg0 = MRI.createVirtualRegister(Dst0RC);
+ const TargetRegisterClass *Dst1RC = MRI.getRegClass(Dst1.getReg());
+ unsigned NewReg1 = MRI.createVirtualRegister(Dst1RC);
+
+ MachineInstr *Inst32 = TII.buildShrunkInst(*MI, Op32);
+
+ if (HaveNonDbgCarryUse) {
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), Dst1.getReg())
+ .addReg(AMDGPU::VCC, RegState::Kill);
+ }
+
+ // Keep the old instruction around to avoid breaking iterators, but
+ // replace the outputs with dummy registers.
+ Dst0.setReg(NewReg0);
+ Dst1.setReg(NewReg1);
+
+ if (Fold.isCommuted())
+ TII.commuteInstruction(*Inst32, false);
+ return true;
+ }
+
Old.ChangeToImmediate(Fold.ImmToFold);
return true;
}
+ assert(!Fold.needsShrink() && "not handled");
+
if (Fold.isFI()) {
Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
return true;
@@ -261,6 +312,8 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
if (isUseMIInFoldList(FoldList, MI))
return false;
+ unsigned CommuteOpNo = OpNo;
+
// Operand is not legal, so try to commute the instruction to
// see if this makes it possible to fold.
unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex;
@@ -269,11 +322,12 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
if (CanCommute) {
if (CommuteIdx0 == OpNo)
- OpNo = CommuteIdx1;
+ CommuteOpNo = CommuteIdx1;
else if (CommuteIdx1 == OpNo)
- OpNo = CommuteIdx0;
+ CommuteOpNo = CommuteIdx0;
}
+
// One of the operands might be an Imm operand, and OpNo may refer to it after
// the call of commuteInstruction() below. Such situations are avoided
// here explicitly as OpNo must be a register operand to be a candidate
@@ -286,12 +340,34 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
!TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))
return false;
- if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
+ if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) {
+ if ((Opc == AMDGPU::V_ADD_I32_e64 ||
+ Opc == AMDGPU::V_SUB_I32_e64 ||
+ Opc == AMDGPU::V_SUBREV_I32_e64) && // FIXME
+ OpToFold->isImm()) {
+ MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+
+ // Verify the other operand is a VGPR, otherwise we would violate the
+ // constant bus restriction.
+ unsigned OtherIdx = CommuteOpNo == CommuteIdx0 ? CommuteIdx1 : CommuteIdx0;
+ MachineOperand &OtherOp = MI->getOperand(OtherIdx);
+ if (!OtherOp.isReg() ||
+ !TII->getRegisterInfo().isVGPR(MRI, OtherOp.getReg()))
+ return false;
+
+ assert(MI->getOperand(1).isDef());
+
+ int Op32 = AMDGPU::getVOPe32(Opc);
+ FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true,
+ Op32));
+ return true;
+ }
+
TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1);
return false;
}
- FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold, true));
+ FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true));
return true;
}
@@ -362,8 +438,6 @@ void SIFoldOperands::foldOperand(
bool FoldingImm = OpToFold.isImm();
- // In order to fold immediates into copies, we need to change the
- // copy to a MOV.
if (FoldingImm && UseMI->isCopy()) {
unsigned DestReg = UseMI->getOperand(0).getReg();
const TargetRegisterClass *DestRC
@@ -371,6 +445,31 @@ void SIFoldOperands::foldOperand(
MRI->getRegClass(DestReg) :
TRI->getPhysRegClass(DestReg);
+ unsigned SrcReg = UseMI->getOperand(1).getReg();
+ if (TargetRegisterInfo::isVirtualRegister(DestReg) &&
+ TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+ const TargetRegisterClass * SrcRC = MRI->getRegClass(SrcReg);
+ if (TRI->isSGPRClass(SrcRC) && TRI->hasVGPRs(DestRC)) {
+ MachineRegisterInfo::use_iterator NextUse;
+ SmallVector<FoldCandidate, 4> CopyUses;
+ for (MachineRegisterInfo::use_iterator
+ Use = MRI->use_begin(DestReg), E = MRI->use_end();
+ Use != E; Use = NextUse) {
+ NextUse = std::next(Use);
+ FoldCandidate FC = FoldCandidate(Use->getParent(),
+ Use.getOperandNo(), &UseMI->getOperand(1));
+ CopyUses.push_back(FC);
+ }
+ for (auto & F : CopyUses) {
+ foldOperand(*F.OpToFold, F.UseMI, F.UseOpNo,
+ FoldList, CopiesToReplace);
+ }
+ }
+ }
+
+ // In order to fold immediates into copies, we need to change the
+ // copy to a MOV.
+
unsigned MovOp = TII->getMovOpcode(DestRC);
if (MovOp == AMDGPU::COPY)
return;
@@ -378,6 +477,20 @@ void SIFoldOperands::foldOperand(
UseMI->setDesc(TII->get(MovOp));
CopiesToReplace.push_back(UseMI);
} else {
+ if (UseMI->isCopy() && OpToFold.isReg() &&
+ TargetRegisterInfo::isVirtualRegister(UseMI->getOperand(0).getReg()) &&
+ TargetRegisterInfo::isVirtualRegister(UseMI->getOperand(1).getReg()) &&
+ TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
+ TRI->isVGPR(*MRI, UseMI->getOperand(1).getReg()) &&
+ !UseMI->getOperand(1).getSubReg()) {
+ UseMI->getOperand(1).setReg(OpToFold.getReg());
+ UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
+ UseMI->getOperand(1).setIsKill(false);
+ CopiesToReplace.push_back(UseMI);
+ OpToFold.setIsKill(false);
+ return;
+ }
+
const MCInstrDesc &UseDesc = UseMI->getDesc();
// Don't fold into target independent nodes. Target independent opcodes
@@ -550,6 +663,19 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
if (!Src0->isImm() && !Src1->isImm())
return false;
+ if (MI->getOpcode() == AMDGPU::V_LSHL_OR_B32) {
+ if (Src0->isImm() && Src0->getImm() == 0) {
+ // v_lshl_or_b32 0, X, Y -> copy Y
+ // v_lshl_or_b32 0, X, K -> v_mov_b32 K
+ bool UseCopy = TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->isReg();
+ MI->RemoveOperand(Src1Idx);
+ MI->RemoveOperand(Src0Idx);
+
+ MI->setDesc(TII->get(UseCopy ? AMDGPU::COPY : AMDGPU::V_MOV_B32_e32));
+ return true;
+ }
+ }
+
// and k0, k1 -> v_mov_b32 (k0 & k1)
// or k0, k1 -> v_mov_b32 (k0 | k1)
// xor k0, k1 -> v_mov_b32 (k0 ^ k1)
@@ -728,13 +854,17 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
}
} else {
// Folding register.
+ SmallVector <MachineRegisterInfo::use_iterator, 4> UsesToProcess;
for (MachineRegisterInfo::use_iterator
Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
Use != E; ++Use) {
- MachineInstr *UseMI = Use->getParent();
+ UsesToProcess.push_back(Use);
+ }
+ for (auto U : UsesToProcess) {
+ MachineInstr *UseMI = U->getParent();
- foldOperand(OpToFold, UseMI, Use.getOperandNo(),
- FoldList, CopiesToReplace);
+ foldOperand(OpToFold, UseMI, U.getOperandNo(),
+ FoldList, CopiesToReplace);
}
}
@@ -744,7 +874,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
Copy->addImplicitDefUseOperands(*MF);
for (FoldCandidate &Fold : FoldList) {
- if (updateOperand(Fold, *TRI)) {
+ if (updateOperand(Fold, *TII, *TRI)) {
// Clear kill flags.
if (Fold.isReg()) {
assert(Fold.OpToFold && Fold.OpToFold->isReg());
@@ -981,9 +1111,8 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
// omod is ignored by hardware if IEEE bit is enabled. omod also does not
// correctly handle signed zeros.
//
- // TODO: Check nsz on instructions when fast math flags are preserved to MI
- // level.
- bool IsIEEEMode = ST->enableIEEEBit(MF) || !MFI->hasNoSignedZerosFPMath();
+ bool IsIEEEMode = ST->enableIEEEBit(MF);
+ bool HasNSZ = MFI->hasNoSignedZerosFPMath();
for (MachineBasicBlock *MBB : depth_first(&MF)) {
MachineBasicBlock::iterator I, Next;
@@ -994,7 +1123,10 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
tryFoldInst(TII, &MI);
if (!TII->isFoldableCopy(MI)) {
- if (IsIEEEMode || !tryFoldOMod(MI))
+ // TODO: Omod might be OK if there is NSZ only on the source
+ // instruction, and not the omod multiply.
+ if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
+ !tryFoldOMod(MI))
tryFoldClamp(MI);
continue;
}
diff --git a/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
index cd14239de822b..aa976d5141f86 100644
--- a/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
+++ b/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
@@ -168,16 +168,15 @@ void SIFormMemoryClauses::forAllLanes(unsigned Reg, LaneBitmask LaneMask,
CoveringSubregs.push_back(Idx);
}
- llvm::sort(CoveringSubregs.begin(), CoveringSubregs.end(),
- [this](unsigned A, unsigned B) {
- LaneBitmask MaskA = TRI->getSubRegIndexLaneMask(A);
- LaneBitmask MaskB = TRI->getSubRegIndexLaneMask(B);
- unsigned NA = MaskA.getNumLanes();
- unsigned NB = MaskB.getNumLanes();
- if (NA != NB)
- return NA > NB;
- return MaskA.getHighestLane() > MaskB.getHighestLane();
- });
+ llvm::sort(CoveringSubregs, [this](unsigned A, unsigned B) {
+ LaneBitmask MaskA = TRI->getSubRegIndexLaneMask(A);
+ LaneBitmask MaskB = TRI->getSubRegIndexLaneMask(B);
+ unsigned NA = MaskA.getNumLanes();
+ unsigned NB = MaskB.getNumLanes();
+ if (NA != NB)
+ return NA > NB;
+ return MaskA.getHighestLane() > MaskB.getHighestLane();
+ });
for (unsigned Idx : CoveringSubregs) {
LaneBitmask SubRegMask = TRI->getSubRegIndexLaneMask(Idx);
diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp
index ac0ef90f25a4f..e4633c88e18ff 100644
--- a/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -289,7 +289,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
- if (ST.isAmdCodeObjectV2(F)) {
+ if (ST.isAmdHsaOrMesa(F)) {
PreloadedPrivateBufferReg = MFI->getPreloadedReg(
AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
}
@@ -308,7 +308,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
}
if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) {
- assert(ST.isAmdCodeObjectV2(F) || ST.isMesaGfxShader(F));
+ assert(ST.isAmdHsaOrMesa(F) || ST.isMesaGfxShader(F));
MRI.addLiveIn(PreloadedPrivateBufferReg);
MBB.addLiveIn(PreloadedPrivateBufferReg);
}
@@ -333,7 +333,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
bool CopyBuffer = ResourceRegUsed &&
PreloadedPrivateBufferReg != AMDGPU::NoRegister &&
- ST.isAmdCodeObjectV2(F) &&
+ ST.isAmdHsaOrMesa(F) &&
ScratchRsrcReg != PreloadedPrivateBufferReg;
// This needs to be careful of the copying order to avoid overwriting one of
@@ -433,7 +433,7 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST,
}
if (ST.isMesaGfxShader(Fn)
|| (PreloadedPrivateBufferReg == AMDGPU::NoRegister)) {
- assert(!ST.isAmdCodeObjectV2(Fn));
+ assert(!ST.isAmdHsaOrMesa(Fn));
const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 25007861fd158..0ba921647097d 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -12,7 +12,7 @@
//
//===----------------------------------------------------------------------===//
-#ifdef _MSC_VER
+#if defined(_MSC_VER) || defined(__MINGW32__)
// Provide M_PI.
#define _USE_MATH_DEFINES
#endif
@@ -156,12 +156,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
setOperationAction(ISD::LOAD, MVT::i1, Custom);
+ setOperationAction(ISD::LOAD, MVT::v32i32, Custom);
setOperationAction(ISD::STORE, MVT::v2i32, Custom);
setOperationAction(ISD::STORE, MVT::v4i32, Custom);
setOperationAction(ISD::STORE, MVT::v8i32, Custom);
setOperationAction(ISD::STORE, MVT::v16i32, Custom);
setOperationAction(ISD::STORE, MVT::i1, Custom);
+ setOperationAction(ISD::STORE, MVT::v32i32, Custom);
setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
@@ -207,11 +209,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f16, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
@@ -232,6 +237,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::ADDCARRY, MVT::i32, Legal);
setOperationAction(ISD::SUBCARRY, MVT::i32, Legal);
+ setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
+ setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);
+ setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);
+
#if 0
setOperationAction(ISD::ADDCARRY, MVT::i64, Legal);
setOperationAction(ISD::SUBCARRY, MVT::i64, Legal);
@@ -240,7 +249,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// We only support LOAD/STORE and vector manipulation ops for vectors
// with > 4 elements.
for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
- MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16 }) {
+ MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16, MVT::v32i32 }) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
switch (Op) {
case ISD::LOAD:
@@ -339,6 +348,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
if (Subtarget->has16BitInsts()) {
setOperationAction(ISD::FLOG, MVT::f16, Custom);
+ setOperationAction(ISD::FEXP, MVT::f16, Custom);
setOperationAction(ISD::FLOG10, MVT::f16, Custom);
}
@@ -375,8 +385,20 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
if (Subtarget->hasBFE())
setHasExtractBitsInsn(true);
- setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
- setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
+ setOperationAction(ISD::FMINNUM, MVT::f32, Custom);
+ setOperationAction(ISD::FMAXNUM, MVT::f32, Custom);
+ setOperationAction(ISD::FMINNUM, MVT::f64, Custom);
+ setOperationAction(ISD::FMAXNUM, MVT::f64, Custom);
+
+
+ // These are really only legal for ieee_mode functions. We should be avoiding
+ // them for functions that don't have ieee_mode enabled, so just say they are
+ // legal.
+ setOperationAction(ISD::FMINNUM_IEEE, MVT::f32, Legal);
+ setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal);
+ setOperationAction(ISD::FMINNUM_IEEE, MVT::f64, Legal);
+ setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal);
+
if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
@@ -465,8 +487,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// F16 - VOP2 Actions.
setOperationAction(ISD::BR_CC, MVT::f16, Expand);
setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
- setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
- setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
+
setOperationAction(ISD::FDIV, MVT::f16, Custom);
// F16 - VOP3 Actions.
@@ -549,6 +570,17 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// This isn't really legal, but this avoids the legalizer unrolling it (and
// allows matching fneg (fabs x) patterns)
setOperationAction(ISD::FABS, MVT::v2f16, Legal);
+
+ setOperationAction(ISD::FMAXNUM, MVT::f16, Custom);
+ setOperationAction(ISD::FMINNUM, MVT::f16, Custom);
+ setOperationAction(ISD::FMAXNUM_IEEE, MVT::f16, Legal);
+ setOperationAction(ISD::FMINNUM_IEEE, MVT::f16, Legal);
+
+ setOperationAction(ISD::FMINNUM_IEEE, MVT::v4f16, Custom);
+ setOperationAction(ISD::FMAXNUM_IEEE, MVT::v4f16, Custom);
+
+ setOperationAction(ISD::FMINNUM, MVT::v4f16, Expand);
+ setOperationAction(ISD::FMAXNUM, MVT::v4f16, Expand);
}
if (Subtarget->hasVOP3PInsts()) {
@@ -566,8 +598,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FADD, MVT::v2f16, Legal);
setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
setOperationAction(ISD::FMA, MVT::v2f16, Legal);
- setOperationAction(ISD::FMINNUM, MVT::v2f16, Legal);
- setOperationAction(ISD::FMAXNUM, MVT::v2f16, Legal);
+
+ setOperationAction(ISD::FMINNUM_IEEE, MVT::v2f16, Legal);
+ setOperationAction(ISD::FMAXNUM_IEEE, MVT::v2f16, Legal);
+
setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
@@ -587,9 +621,15 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FADD, MVT::v4f16, Custom);
setOperationAction(ISD::FMUL, MVT::v4f16, Custom);
+
+ setOperationAction(ISD::FMAXNUM, MVT::v2f16, Custom);
+ setOperationAction(ISD::FMINNUM, MVT::v2f16, Custom);
+
setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom);
setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom);
+ setOperationAction(ISD::FCANONICALIZE, MVT::v4f16, Custom);
+ setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
setOperationAction(ISD::SELECT, MVT::v4i16, Custom);
setOperationAction(ISD::SELECT, MVT::v4f16, Custom);
}
@@ -623,6 +663,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::FSUB);
setTargetDAGCombine(ISD::FMINNUM);
setTargetDAGCombine(ISD::FMAXNUM);
+ setTargetDAGCombine(ISD::FMINNUM_IEEE);
+ setTargetDAGCombine(ISD::FMAXNUM_IEEE);
setTargetDAGCombine(ISD::FMA);
setTargetDAGCombine(ISD::SMIN);
setTargetDAGCombine(ISD::SMAX);
@@ -638,7 +680,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
setTargetDAGCombine(ISD::ZERO_EXTEND);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
- setTargetDAGCombine(ISD::BUILD_VECTOR);
+ setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
// All memory operations. Some folding on the pointer operand is done to help
// matching the constant offsets in the addressing modes.
@@ -707,9 +749,7 @@ MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
if (Size == 64)
return MVT::i32;
- if (Size == 16 &&
- Subtarget->has16BitInsts() &&
- isPowerOf2_32(VT.getVectorNumElements()))
+ if (Size == 16 && Subtarget->has16BitInsts())
return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
}
@@ -730,9 +770,8 @@ unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
if (Size == 64)
return 2 * NumElts;
- // FIXME: Fails to break down as we want with v3.
- if (Size == 16 && Subtarget->has16BitInsts() && isPowerOf2_32(NumElts))
- return VT.getVectorNumElements() / 2;
+ if (Size == 16 && Subtarget->has16BitInsts())
+ return (VT.getVectorNumElements() + 1) / 2;
}
return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
@@ -763,10 +802,10 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
// FIXME: We should fix the ABI to be the same on targets without 16-bit
// support, but unless we can properly handle 3-vectors, it will still be
// inconsistent.
- if (Size == 16 && Subtarget->has16BitInsts() && isPowerOf2_32(NumElts)) {
+ if (Size == 16 && Subtarget->has16BitInsts()) {
RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
IntermediateVT = RegisterVT;
- NumIntermediates = NumElts / 2;
+ NumIntermediates = (NumElts + 1) / 2;
return NumIntermediates;
}
}
@@ -775,6 +814,47 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
}
+static MVT memVTFromAggregate(Type *Ty) {
+ // Only limited forms of aggregate type are currently expected: { data, i32 }.
+ assert(Ty->isStructTy() && "Expected struct type");
+
+
+ Type *ElementType = nullptr;
+ unsigned NumElts;
+ if (Ty->getContainedType(0)->isVectorTy()) {
+ VectorType *VecComponent = cast<VectorType>(Ty->getContainedType(0));
+ ElementType = VecComponent->getElementType();
+ NumElts = VecComponent->getNumElements();
+ } else {
+ ElementType = Ty->getContainedType(0);
+ NumElts = 1;
+ }
+
+ assert((Ty->getContainedType(1) && Ty->getContainedType(1)->isIntegerTy(32)) && "Expected int32 type");
+
+ // memVT size: NumElts + 1 extra elt (2 if 16-bit), rounded up to a power of 2
+ unsigned Pow2Elts = 0;
+ unsigned ElementSize;
+ switch (ElementType->getTypeID()) {
+ default:
+ llvm_unreachable("Unknown type!");
+ case Type::IntegerTyID:
+ ElementSize = cast<IntegerType>(ElementType)->getBitWidth();
+ break;
+ case Type::HalfTyID:
+ ElementSize = 16;
+ break;
+ case Type::FloatTyID:
+ ElementSize = 32;
+ break;
+ }
+ unsigned AdditionalElts = ElementSize == 16 ? 2 : 1;
+ Pow2Elts = 1 << Log2_32_Ceil(NumElts + AdditionalElts);
+
+ return MVT::getVectorVT(MVT::getVT(ElementType, false),
+ Pow2Elts);
+}
+
bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &CI,
MachineFunction &MF,
@@ -802,7 +882,12 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags = MachineMemOperand::MODereferenceable;
if (Attr.hasFnAttribute(Attribute::ReadOnly)) {
Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = MVT::getVT(CI.getType());
+ Info.memVT = MVT::getVT(CI.getType(), true);
+ if (Info.memVT == MVT::Other) {
+ // Some intrinsics return an aggregate type - special case to work out
+ // the correct memVT
+ Info.memVT = memVTFromAggregate(CI.getType());
+ }
Info.flags |= MachineMemOperand::MOLoad;
} else if (Attr.hasFnAttribute(Attribute::WriteOnly)) {
Info.opc = ISD::INTRINSIC_VOID;
@@ -941,11 +1026,11 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
if (AM.BaseGV)
return false;
- if (AS == AMDGPUASI.GLOBAL_ADDRESS)
+ if (AS == AMDGPUAS::GLOBAL_ADDRESS)
return isLegalGlobalAddressingMode(AM);
- if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
- AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT) {
+ if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
+ AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
// If the offset isn't a multiple of 4, it probably isn't going to be
// correctly aligned.
// FIXME: Can we get the real alignment here?
@@ -983,10 +1068,10 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
return false;
- } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
+ } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
return isLegalMUBUFAddressingMode(AM);
- } else if (AS == AMDGPUASI.LOCAL_ADDRESS ||
- AS == AMDGPUASI.REGION_ADDRESS) {
+ } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
+ AS == AMDGPUAS::REGION_ADDRESS) {
// Basic, single offset DS instructions allow a 16-bit unsigned immediate
// field.
// XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
@@ -1001,8 +1086,8 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
return true;
return false;
- } else if (AS == AMDGPUASI.FLAT_ADDRESS ||
- AS == AMDGPUASI.UNKNOWN_ADDRESS_SPACE) {
+ } else if (AS == AMDGPUAS::FLAT_ADDRESS ||
+ AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) {
// For an unknown address space, this usually means that this is for some
// reason being used for pure arithmetic, and not based on some addressing
// computation. We don't have instructions that compute pointers with any
@@ -1016,12 +1101,12 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
const SelectionDAG &DAG) const {
- if (AS == AMDGPUASI.GLOBAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS) {
+ if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
return (MemVT.getSizeInBits() <= 4 * 32);
- } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
+ } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
return (MemVT.getSizeInBits() <= MaxPrivateBits);
- } else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
+ } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
return (MemVT.getSizeInBits() <= 2 * 32);
}
return true;
@@ -1043,8 +1128,8 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
return false;
}
- if (AddrSpace == AMDGPUASI.LOCAL_ADDRESS ||
- AddrSpace == AMDGPUASI.REGION_ADDRESS) {
+ if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
+ AddrSpace == AMDGPUAS::REGION_ADDRESS) {
// ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
// aligned, 8 byte access in a single operation using ds_read2/write2_b32
// with adjacent offsets.
@@ -1059,17 +1144,21 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
// will access scratch. If we had access to the IR function, then we
// could determine if any private memory was used in the function.
if (!Subtarget->hasUnalignedScratchAccess() &&
- (AddrSpace == AMDGPUASI.PRIVATE_ADDRESS ||
- AddrSpace == AMDGPUASI.FLAT_ADDRESS)) {
- return false;
+ (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
+ AddrSpace == AMDGPUAS::FLAT_ADDRESS)) {
+ bool AlignedBy4 = Align >= 4;
+ if (IsFast)
+ *IsFast = AlignedBy4;
+
+ return AlignedBy4;
}
if (Subtarget->hasUnalignedBufferAccess()) {
// If we have an uniform constant load, it still requires using a slow
// buffer instruction if unaligned.
if (IsFast) {
- *IsFast = (AddrSpace == AMDGPUASI.CONSTANT_ADDRESS ||
- AddrSpace == AMDGPUASI.CONSTANT_ADDRESS_32BIT) ?
+ *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
+ AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ?
(Align % 4 == 0) : true;
}
@@ -1109,17 +1198,15 @@ EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
return MVT::Other;
}
-static bool isFlatGlobalAddrSpace(unsigned AS, AMDGPUAS AMDGPUASI) {
- return AS == AMDGPUASI.GLOBAL_ADDRESS ||
- AS == AMDGPUASI.FLAT_ADDRESS ||
- AS == AMDGPUASI.CONSTANT_ADDRESS ||
- AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT;
+static bool isFlatGlobalAddrSpace(unsigned AS) {
+ return AS == AMDGPUAS::GLOBAL_ADDRESS ||
+ AS == AMDGPUAS::FLAT_ADDRESS ||
+ AS == AMDGPUAS::CONSTANT_ADDRESS;
}
bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
unsigned DestAS) const {
- return isFlatGlobalAddrSpace(SrcAS, AMDGPUASI) &&
- isFlatGlobalAddrSpace(DestAS, AMDGPUASI);
+ return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
}
bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
@@ -1133,7 +1220,7 @@ bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS,
unsigned DestAS) const {
// Flat -> private/local is a simple truncate.
// Flat -> global is no-op
- if (SrcAS == AMDGPUASI.FLAT_ADDRESS)
+ if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
return true;
return isNoopAddrSpaceCast(SrcAS, DestAS);
@@ -1146,7 +1233,7 @@ bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
}
TargetLoweringBase::LegalizeTypeAction
-SITargetLowering::getPreferredVectorAction(EVT VT) const {
+SITargetLowering::getPreferredVectorAction(MVT VT) const {
if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
return TypeSplitVector;
@@ -1200,7 +1287,7 @@ SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
= Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
- MVT PtrVT = getPointerTy(DL, AMDGPUASI.CONSTANT_ADDRESS);
+ MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
@@ -1240,7 +1327,7 @@ SDValue SITargetLowering::lowerKernargMemParameter(
uint64_t Offset, unsigned Align, bool Signed,
const ISD::InputArg *Arg) const {
Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
- PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS);
+ PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
// Try to avoid using an extload by loading earlier than the argument address,
@@ -1349,7 +1436,8 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
const ISD::InputArg *Arg = &Ins[I];
- assert(!Arg->VT.isVector() && "vector type argument should have been split");
+ assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
+ "vector type argument should have been split");
// First check if it's a PS input addr.
if (CallConv == CallingConv::AMDGPU_PS &&
@@ -1642,7 +1730,7 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- if (ST.isAmdCodeObjectV2(MF.getFunction())) {
+ if (ST.isAmdHsaOrMesa(MF.getFunction())) {
if (RequiresStackAccess) {
// If we have stack objects, we unquestionably need the private buffer
// resource. For the Code Object V2 ABI, this will be the first 4 user
@@ -1951,29 +2039,6 @@ SDValue SITargetLowering::LowerFormalArguments(
llvm_unreachable("Unknown loc info!");
}
- if (IsShader && Arg.VT.isVector()) {
- // Build a vector from the registers
- Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
- unsigned NumElements = ParamType->getVectorNumElements();
-
- SmallVector<SDValue, 4> Regs;
- Regs.push_back(Val);
- for (unsigned j = 1; j != NumElements; ++j) {
- Reg = ArgLocs[ArgIdx++].getLocReg();
- Reg = MF.addLiveIn(Reg, RC);
-
- SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
- Regs.push_back(Copy);
- }
-
- // Fill up the missing vector elements
- NumElements = Arg.VT.getVectorNumElements() - NumElements;
- Regs.append(NumElements, DAG.getUNDEF(VT));
-
- InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs));
- continue;
- }
-
InVals.push_back(Val);
}
@@ -2037,48 +2102,19 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool IsShader = AMDGPU::isShader(CallConv);
- Info->setIfReturnsVoid(Outs.size() == 0);
+ Info->setIfReturnsVoid(Outs.empty());
bool IsWaveEnd = Info->returnsVoid() && IsShader;
- SmallVector<ISD::OutputArg, 48> Splits;
- SmallVector<SDValue, 48> SplitVals;
-
- // Split vectors into their elements.
- for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
- const ISD::OutputArg &Out = Outs[i];
-
- if (IsShader && Out.VT.isVector()) {
- MVT VT = Out.VT.getVectorElementType();
- ISD::OutputArg NewOut = Out;
- NewOut.Flags.setSplit();
- NewOut.VT = VT;
-
- // We want the original number of vector elements here, e.g.
- // three or five, not four or eight.
- unsigned NumElements = Out.ArgVT.getVectorNumElements();
-
- for (unsigned j = 0; j != NumElements; ++j) {
- SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, OutVals[i],
- DAG.getConstant(j, DL, MVT::i32));
- SplitVals.push_back(Elem);
- Splits.push_back(NewOut);
- NewOut.PartOffset += NewOut.VT.getStoreSize();
- }
- } else {
- SplitVals.push_back(OutVals[i]);
- Splits.push_back(Out);
- }
- }
-
// CCValAssign - represent the assignment of the return value to a location.
SmallVector<CCValAssign, 48> RVLocs;
+ SmallVector<ISD::OutputArg, 48> Splits;
// CCState - Info about the registers and stack slots.
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
// Analyze outgoing return values.
- CCInfo.AnalyzeReturn(Splits, CCAssignFnForReturn(CallConv, isVarArg));
+ CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
SDValue Flag;
SmallVector<SDValue, 48> RetOps;
@@ -2103,14 +2139,12 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
}
// Copy the result values into the output registers.
- for (unsigned i = 0, realRVLocIdx = 0;
- i != RVLocs.size();
- ++i, ++realRVLocIdx) {
- CCValAssign &VA = RVLocs[i];
+ for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
+ ++I, ++RealRVLocIdx) {
+ CCValAssign &VA = RVLocs[I];
assert(VA.isRegLoc() && "Can only return in registers!");
// TODO: Partially return in registers if return values don't fit.
-
- SDValue Arg = SplitVals[realRVLocIdx];
+ SDValue Arg = OutVals[RealRVLocIdx];
// Copied from other backends.
switch (VA.getLocInfo()) {
@@ -2225,11 +2259,11 @@ SDValue SITargetLowering::LowerCallResult(
// from the explicit user arguments present in the IR.
void SITargetLowering::passSpecialInputs(
CallLoweringInfo &CLI,
+ CCState &CCInfo,
const SIMachineFunctionInfo &Info,
SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
SmallVectorImpl<SDValue> &MemOpChains,
- SDValue Chain,
- SDValue StackPtr) const {
+ SDValue Chain) const {
// If we don't have a call site, this was a call inserted by
// legalization. These can never use special inputs.
if (!CLI.CS)
@@ -2297,9 +2331,9 @@ void SITargetLowering::passSpecialInputs(
if (OutgoingArg->isRegister()) {
RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
} else {
- SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, StackPtr,
- InputReg,
- OutgoingArg->getStackOffset());
+ unsigned SpecialArgOffset = CCInfo.AllocateStack(ArgVT.getStoreSize(), 4);
+ SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
+ SpecialArgOffset);
MemOpChains.push_back(ArgStore);
}
}
@@ -2424,6 +2458,9 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
"unsupported call to variadic function ");
}
+ if (!CLI.CS.getInstruction())
+ report_fatal_error("unsupported libcall legalization");
+
if (!CLI.CS.getCalledFunction()) {
return lowerUnhandledCall(CLI, InVals,
"unsupported indirect call to function ");
@@ -2442,8 +2479,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
}
// The first 4 bytes are reserved for the callee's emergency stack slot.
- const unsigned CalleeUsableStackOffset = 4;
-
if (IsTailCall) {
IsTailCall = isEligibleForTailCallOptimization(
Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
@@ -2463,25 +2498,16 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
++NumTailCalls;
}
- if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Callee)) {
- // FIXME: Remove this hack for function pointer types after removing
- // support of old address space mapping. In the new address space
- // mapping the pointer in default address space is 64 bit, therefore
- // does not need this hack.
- if (Callee.getValueType() == MVT::i32) {
- const GlobalValue *GV = GA->getGlobal();
- Callee = DAG.getGlobalAddress(GV, DL, MVT::i64, GA->getOffset(), false,
- GA->getTargetFlags());
- }
- }
- assert(Callee.getValueType() == MVT::i64);
-
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
+
+ // The first 4 bytes are reserved for the callee's emergency stack slot.
+ CCInfo.AllocateStack(4, 4);
+
CCInfo.AnalyzeCallOperands(Outs, AssignFn);
// Get a count of how many bytes are to be pushed on the stack.
@@ -2529,10 +2555,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
}
}
- // Stack pointer relative accesses are done by changing the offset SGPR. This
- // is just the VGPR offset component.
- SDValue StackPtr = DAG.getConstant(CalleeUsableStackOffset, DL, MVT::i32);
-
SmallVector<SDValue, 8> MemOpChains;
MVT PtrVT = MVT::i32;
@@ -2576,18 +2598,22 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
unsigned LocMemOffset = VA.getLocMemOffset();
int32_t Offset = LocMemOffset;
- SDValue PtrOff = DAG.getObjectPtrOffset(DL, StackPtr, Offset);
+ SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
+ unsigned Align = 0;
if (IsTailCall) {
ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
unsigned OpSize = Flags.isByVal() ?
Flags.getByValSize() : VA.getValVT().getStoreSize();
+ // FIXME: We can have better than the minimum byval required alignment.
+ Align = Flags.isByVal() ? Flags.getByValAlign() :
+ MinAlign(Subtarget->getStackAlignment(), Offset);
+
Offset = Offset + FPDiff;
int FI = MFI.CreateFixedObject(OpSize, Offset, true);
- DstAddr = DAG.getObjectPtrOffset(DL, DAG.getFrameIndex(FI, PtrVT),
- StackPtr);
+ DstAddr = DAG.getFrameIndex(FI, PtrVT);
DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
// Make sure any stack arguments overlapping with where we're storing
@@ -2601,6 +2627,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
} else {
DstAddr = PtrOff;
DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
+ Align = MinAlign(Subtarget->getStackAlignment(), LocMemOffset);
}
if (Outs[i].Flags.isByVal()) {
@@ -2611,18 +2638,18 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
/*isVol = */ false, /*AlwaysInline = */ true,
/*isTailCall = */ false, DstInfo,
MachinePointerInfo(UndefValue::get(Type::getInt8PtrTy(
- *DAG.getContext(), AMDGPUASI.PRIVATE_ADDRESS))));
+ *DAG.getContext(), AMDGPUAS::PRIVATE_ADDRESS))));
MemOpChains.push_back(Cpy);
} else {
- SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
+ SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Align);
MemOpChains.push_back(Store);
}
}
}
// Copy special input registers after user input arguments.
- passSpecialInputs(CLI, *Info, RegsToPass, MemOpChains, Chain, StackPtr);
+ passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
@@ -3460,7 +3487,7 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
MIB.add(MI.getOperand(I));
- MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+ MIB.cloneMemRefs(MI);
MI.eraseFromParent();
return BB;
}
@@ -3628,7 +3655,11 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return lowerDEBUGTRAP(Op, DAG);
case ISD::FABS:
case ISD::FNEG:
+ case ISD::FCANONICALIZE:
return splitUnaryVectorOp(Op, DAG);
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM:
+ return lowerFMINNUM_FMAXNUM(Op, DAG);
case ISD::SHL:
case ISD::SRA:
case ISD::SRL:
@@ -3639,10 +3670,10 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SMAX:
case ISD::UMIN:
case ISD::UMAX:
- case ISD::FMINNUM:
- case ISD::FMAXNUM:
case ISD::FADD:
case ISD::FMUL:
+ case ISD::FMINNUM_IEEE:
+ case ISD::FMAXNUM_IEEE:
return splitBinaryVectorOp(Op, DAG);
}
return SDValue();
@@ -3678,18 +3709,9 @@ static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
MemSDNode *M,
SelectionDAG &DAG,
+ ArrayRef<SDValue> Ops,
bool IsIntrinsic) const {
SDLoc DL(M);
- SmallVector<SDValue, 10> Ops;
- Ops.reserve(M->getNumOperands());
-
- Ops.push_back(M->getOperand(0));
- if (IsIntrinsic)
- Ops.push_back(DAG.getConstant(Opcode, DL, MVT::i32));
-
- // Skip 1, as it is the intrinsic ID.
- for (unsigned I = 2, E = M->getNumOperands(); I != E; ++I)
- Ops.push_back(M->getOperand(I));
bool Unpacked = Subtarget->hasUnpackedD16VMem();
EVT LoadVT = M->getValueType(0);
@@ -3717,6 +3739,69 @@ SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
}
+static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI,
+ SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+ const auto *CD = dyn_cast<ConstantSDNode>(N->getOperand(3));
+ if (!CD)
+ return DAG.getUNDEF(VT);
+
+ int CondCode = CD->getSExtValue();
+ if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE ||
+ CondCode > ICmpInst::Predicate::LAST_ICMP_PREDICATE)
+ return DAG.getUNDEF(VT);
+
+ ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
+
+
+ SDValue LHS = N->getOperand(1);
+ SDValue RHS = N->getOperand(2);
+
+ SDLoc DL(N);
+
+ EVT CmpVT = LHS.getValueType();
+ if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
+ unsigned PromoteOp = ICmpInst::isSigned(IcInput) ?
+ ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
+ RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
+ }
+
+ ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
+
+ return DAG.getNode(AMDGPUISD::SETCC, DL, VT, LHS, RHS,
+ DAG.getCondCode(CCOpcode));
+}
+
+static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI,
+ SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+ const auto *CD = dyn_cast<ConstantSDNode>(N->getOperand(3));
+ if (!CD)
+ return DAG.getUNDEF(VT);
+
+ int CondCode = CD->getSExtValue();
+ if (CondCode < FCmpInst::Predicate::FIRST_FCMP_PREDICATE ||
+ CondCode > FCmpInst::Predicate::LAST_FCMP_PREDICATE) {
+ return DAG.getUNDEF(VT);
+ }
+
+ SDValue Src0 = N->getOperand(1);
+ SDValue Src1 = N->getOperand(2);
+ EVT CmpVT = Src0.getValueType();
+ SDLoc SL(N);
+
+ if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
+ Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
+ Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
+ }
+
+ FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
+ ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
+ return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src0,
+ Src1, DAG.getCondCode(CCOpcode));
+}
+
void SITargetLowering::ReplaceNodeResults(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const {
@@ -3761,8 +3846,13 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
else
Opcode = AMDGPUISD::CVT_PK_U16_U32;
- SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
- Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
+ EVT VT = N->getValueType(0);
+ if (isTypeLegal(VT))
+ Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
+ else {
+ SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
+ Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
+ }
return;
}
}
@@ -3895,15 +3985,15 @@ void SITargetLowering::createDebuggerPrologueStackObjects(
bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
const Triple &TT = getTargetMachine().getTargetTriple();
- return (GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
- GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
+ return (GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
+ GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
AMDGPU::shouldEmitConstantsToTextSection(TT);
}
bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
- return (GV->getType()->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
- GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
- GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
+ return (GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
+ GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
+ GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
!shouldEmitFixup(GV) &&
!getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
}
@@ -4038,6 +4128,23 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
}
+SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
+
+ // FIXME: Assert during selection that this is only selected for
+ // ieee_mode. Currently a combine can produce the ieee version for non-ieee
+ // mode functions, but this happens to be OK since it's only done in cases
+ // where it is known that there is no sNaN.
+ if (IsIEEEMode)
+ return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
+
+ if (VT == MVT::v4f16)
+ return splitBinaryVectorOp(Op, DAG);
+ return Op;
+}
+
SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
SDValue Chain = Op.getOperand(0);
@@ -4091,10 +4198,10 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
SelectionDAG &DAG) const {
// FIXME: Use inline constants (src_{shared, private}_base) instead.
if (Subtarget->hasApertureRegs()) {
- unsigned Offset = AS == AMDGPUASI.LOCAL_ADDRESS ?
+ unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
- unsigned WidthM1 = AS == AMDGPUASI.LOCAL_ADDRESS ?
+ unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
unsigned Encoding =
@@ -4119,7 +4226,7 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
// Offset into amd_queue_t for group_segment_aperture_base_hi /
// private_segment_aperture_base_hi.
- uint32_t StructOffset = (AS == AMDGPUASI.LOCAL_ADDRESS) ? 0x40 : 0x44;
+ uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
SDValue Ptr = DAG.getObjectPtrOffset(DL, QueuePtr, StructOffset);
@@ -4127,7 +4234,7 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
// TODO: We should use the value from the IR intrinsic call, but it might not
// be available and how do we get it?
Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()),
- AMDGPUASI.CONSTANT_ADDRESS));
+ AMDGPUAS::CONSTANT_ADDRESS));
MachinePointerInfo PtrInfo(V, StructOffset);
return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
@@ -4148,11 +4255,11 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
// flat -> local/private
- if (ASC->getSrcAddressSpace() == AMDGPUASI.FLAT_ADDRESS) {
+ if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
unsigned DestAS = ASC->getDestAddressSpace();
- if (DestAS == AMDGPUASI.LOCAL_ADDRESS ||
- DestAS == AMDGPUASI.PRIVATE_ADDRESS) {
+ if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
+ DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
unsigned NullVal = TM.getNullPointerValue(DestAS);
SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
@@ -4164,11 +4271,11 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
}
// local/private -> flat
- if (ASC->getDestAddressSpace() == AMDGPUASI.FLAT_ADDRESS) {
+ if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
unsigned SrcAS = ASC->getSrcAddressSpace();
- if (SrcAS == AMDGPUASI.LOCAL_ADDRESS ||
- SrcAS == AMDGPUASI.PRIVATE_ADDRESS) {
+ if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
+ SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
unsigned NullVal = TM.getNullPointerValue(SrcAS);
SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
@@ -4335,30 +4442,39 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
}
assert(VT == MVT::v2f16 || VT == MVT::v2i16);
+ assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
SDValue Lo = Op.getOperand(0);
SDValue Hi = Op.getOperand(1);
- Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
- Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
+ // Avoid adding defined bits with the zero_extend.
+ if (Hi.isUndef()) {
+ Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
+ SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
+ return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
+ }
- Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
+ Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
DAG.getConstant(16, SL, MVT::i32));
+ if (Lo.isUndef())
+ return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
- SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
+ Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
+ Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
+ SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
return DAG.getNode(ISD::BITCAST, SL, VT, Or);
}
bool
SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
// We can fold offsets for anything that doesn't require a GOT relocation.
- return (GA->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
- GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
- GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
+ return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
+ GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
+ GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
!shouldEmitGOTReloc(GA->getGlobal());
}
@@ -4409,18 +4525,15 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
SelectionDAG &DAG) const {
GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = GSD->getGlobal();
-
- if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS &&
- GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS_32BIT &&
- GSD->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS &&
- // FIXME: It isn't correct to rely on the type of the pointer. This should
- // be removed when address space 0 is 64-bit.
- !GV->getType()->getElementType()->isFunctionTy())
+ if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+ GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
+ GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
SDLoc DL(GSD);
EVT PtrVT = Op.getValueType();
+ // FIXME: Should not make address space based decisions here.
if (shouldEmitFixup(GV))
return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
else if (shouldEmitPCReloc(GV))
@@ -4431,11 +4544,11 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
SIInstrInfo::MO_GOTPCREL32);
Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
- PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS);
+ PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
const DataLayout &DataLayout = DAG.getDataLayout();
unsigned Align = DataLayout.getABITypeAlignment(PtrTy);
- // FIXME: Use a PseudoSourceValue once those can be assigned an address space.
- MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
+ MachinePointerInfo PtrInfo
+ = MachinePointerInfo::getGOT(DAG.getMachineFunction());
return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align,
MachineMemOperand::MODereferenceable |
@@ -4547,11 +4660,115 @@ static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG,
return Value == 0;
}
+// Re-construct the required return value for an image load intrinsic.
+// This is more complicated due to the optional use of TexFailCtrl, which means
+// the required return type is an aggregate.
+static SDValue constructRetValue(SelectionDAG &DAG,
+ MachineSDNode *Result,
+ ArrayRef<EVT> ResultTypes,
+ bool IsTexFail, bool Unpacked, bool IsD16,
+ int DMaskPop, int NumVDataDwords,
+ const SDLoc &DL, LLVMContext &Context) {
+ // Determine the required return type. This is the same regardless of the IsTexFail flag.
+ EVT ReqRetVT = ResultTypes[0];
+ EVT ReqRetEltVT = ReqRetVT.isVector() ? ReqRetVT.getVectorElementType() : ReqRetVT;
+ int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
+ EVT AdjEltVT = Unpacked && IsD16 ? MVT::i32 : ReqRetEltVT;
+ EVT AdjVT = Unpacked ? ReqRetNumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, ReqRetNumElts)
+ : AdjEltVT
+ : ReqRetVT;
+
+ // Extract data part of the result
+ // Bitcast the result to the same type as the required return type
+ int NumElts;
+ if (IsD16 && !Unpacked)
+ NumElts = NumVDataDwords << 1;
+ else
+ NumElts = NumVDataDwords;
+
+ EVT CastVT = NumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, NumElts)
+ : AdjEltVT;
+
+ // Special case for v8f16. Rather than add support for this, use v4i32 to
+ // extract the data elements
+ bool V8F16Special = false;
+ if (CastVT == MVT::v8f16) {
+ CastVT = MVT::v4i32;
+ DMaskPop >>= 1;
+ ReqRetNumElts >>= 1;
+ V8F16Special = true;
+ AdjVT = MVT::v2i32;
+ }
+
+ SDValue N = SDValue(Result, 0);
+ SDValue CastRes = DAG.getNode(ISD::BITCAST, DL, CastVT, N);
+
+ // Iterate over the result
+ SmallVector<SDValue, 4> BVElts;
+
+ if (CastVT.isVector()) {
+ DAG.ExtractVectorElements(CastRes, BVElts, 0, DMaskPop);
+ } else {
+ BVElts.push_back(CastRes);
+ }
+ int ExtraElts = ReqRetNumElts - DMaskPop;
+ while(ExtraElts--)
+ BVElts.push_back(DAG.getUNDEF(AdjEltVT));
+
+ SDValue PreTFCRes;
+ if (ReqRetNumElts > 1) {
+ SDValue NewVec = DAG.getBuildVector(AdjVT, DL, BVElts);
+ if (IsD16 && Unpacked)
+ PreTFCRes = adjustLoadValueTypeImpl(NewVec, ReqRetVT, DL, DAG, Unpacked);
+ else
+ PreTFCRes = NewVec;
+ } else {
+ PreTFCRes = BVElts[0];
+ }
+
+ if (V8F16Special)
+ PreTFCRes = DAG.getNode(ISD::BITCAST, DL, MVT::v4f16, PreTFCRes);
+
+ if (!IsTexFail) {
+ if (Result->getNumValues() > 1)
+ return DAG.getMergeValues({PreTFCRes, SDValue(Result, 1)}, DL);
+ else
+ return PreTFCRes;
+ }
+
+ // Extract the TexFail result and insert into aggregate return
+ SmallVector<SDValue, 1> TFCElt;
+ DAG.ExtractVectorElements(N, TFCElt, DMaskPop, 1);
+ SDValue TFCRes = DAG.getNode(ISD::BITCAST, DL, ResultTypes[1], TFCElt[0]);
+ return DAG.getMergeValues({PreTFCRes, TFCRes, SDValue(Result, 1)}, DL);
+}
+
+static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
+ SDValue *LWE, bool &IsTexFail) {
+ auto TexFailCtrlConst = dyn_cast<ConstantSDNode>(TexFailCtrl.getNode());
+ if (!TexFailCtrlConst)
+ return false;
+
+ uint64_t Value = TexFailCtrlConst->getZExtValue();
+ if (Value) {
+ IsTexFail = true;
+ }
+
+ SDLoc DL(TexFailCtrlConst);
+ *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
+ Value &= ~(uint64_t)0x1;
+ *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
+ Value &= ~(uint64_t)0x2;
+
+ return Value == 0;
+}
+
SDValue SITargetLowering::lowerImage(SDValue Op,
const AMDGPU::ImageDimIntrinsicInfo *Intr,
SelectionDAG &DAG) const {
SDLoc DL(Op);
MachineFunction &MF = DAG.getMachineFunction();
+ const GCNSubtarget* ST = &MF.getSubtarget<GCNSubtarget>();
const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
@@ -4559,12 +4776,17 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
unsigned IntrOpcode = Intr->BaseOpcode;
- SmallVector<EVT, 2> ResultTypes(Op->value_begin(), Op->value_end());
+ SmallVector<EVT, 3> ResultTypes(Op->value_begin(), Op->value_end());
+ SmallVector<EVT, 3> OrigResultTypes(Op->value_begin(), Op->value_end());
bool IsD16 = false;
+ bool IsA16 = false;
SDValue VData;
int NumVDataDwords;
+ bool AdjustRetType = false;
+
unsigned AddrIdx; // Index of first address argument
unsigned DMask;
+ unsigned DMaskLanes = 0;
if (BaseOpcode->Atomic) {
VData = Op.getOperand(2);
@@ -4587,7 +4809,12 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
AddrIdx = 3;
}
} else {
- unsigned DMaskIdx;
+ unsigned DMaskIdx = BaseOpcode->Store ? 3 : isa<MemSDNode>(Op) ? 2 : 1;
+ auto DMaskConst = dyn_cast<ConstantSDNode>(Op.getOperand(DMaskIdx));
+ if (!DMaskConst)
+ return Op;
+ DMask = DMaskConst->getZExtValue();
+ DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
if (BaseOpcode->Store) {
VData = Op.getOperand(2);
@@ -4603,58 +4830,91 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
}
NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
- DMaskIdx = 3;
} else {
- MVT LoadVT = Op.getSimpleValueType();
+ // Work out the num dwords based on the dmask popcount and underlying type
+ // and whether packing is supported.
+ MVT LoadVT = ResultTypes[0].getSimpleVT();
if (LoadVT.getScalarType() == MVT::f16) {
if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ||
!BaseOpcode->HasD16)
return Op; // D16 is unsupported for this instruction
IsD16 = true;
- if (LoadVT.isVector() && Subtarget->hasUnpackedD16VMem())
- ResultTypes[0] = (LoadVT == MVT::v2f16) ? MVT::v2i32 : MVT::v4i32;
}
- NumVDataDwords = (ResultTypes[0].getSizeInBits() + 31) / 32;
- DMaskIdx = isa<MemSDNode>(Op) ? 2 : 1;
- }
+ // Confirm that the return type is large enough for the dmask specified
+ if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
+ (!LoadVT.isVector() && DMaskLanes > 1))
+ return Op;
- auto DMaskConst = dyn_cast<ConstantSDNode>(Op.getOperand(DMaskIdx));
- if (!DMaskConst)
- return Op;
+ if (IsD16 && !Subtarget->hasUnpackedD16VMem())
+ NumVDataDwords = (DMaskLanes + 1) / 2;
+ else
+ NumVDataDwords = DMaskLanes;
- AddrIdx = DMaskIdx + 1;
- DMask = DMaskConst->getZExtValue();
- if (!DMask && !BaseOpcode->Store) {
- // Eliminate no-op loads. Stores with dmask == 0 are *not* no-op: they
- // store the channels' default values.
- SDValue Undef = DAG.getUNDEF(Op.getValueType());
- if (isa<MemSDNode>(Op))
- return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
- return Undef;
+ AdjustRetType = true;
}
+
+ AddrIdx = DMaskIdx + 1;
}
- unsigned NumVAddrs = BaseOpcode->NumExtraArgs +
- (BaseOpcode->Gradients ? DimInfo->NumGradients : 0) +
- (BaseOpcode->Coordinates ? DimInfo->NumCoords : 0) +
- (BaseOpcode->LodOrClampOrMip ? 1 : 0);
+ unsigned NumGradients = BaseOpcode->Gradients ? DimInfo->NumGradients : 0;
+ unsigned NumCoords = BaseOpcode->Coordinates ? DimInfo->NumCoords : 0;
+ unsigned NumLCM = BaseOpcode->LodOrClampOrMip ? 1 : 0;
+ unsigned NumVAddrs = BaseOpcode->NumExtraArgs + NumGradients +
+ NumCoords + NumLCM;
+ unsigned NumMIVAddrs = NumVAddrs;
+
SmallVector<SDValue, 4> VAddrs;
- for (unsigned i = 0; i < NumVAddrs; ++i)
- VAddrs.push_back(Op.getOperand(AddrIdx + i));
// Optimize _L to _LZ when _L is zero
if (LZMappingInfo) {
if (auto ConstantLod =
- dyn_cast<ConstantFPSDNode>(VAddrs[NumVAddrs-1].getNode())) {
+ dyn_cast<ConstantFPSDNode>(Op.getOperand(AddrIdx+NumVAddrs-1))) {
if (ConstantLod->isZero() || ConstantLod->isNegative()) {
IntrOpcode = LZMappingInfo->LZ; // set new opcode to _lz variant of _l
- VAddrs.pop_back(); // remove 'lod'
+ NumMIVAddrs--; // remove 'lod'
}
}
}
+ // Check for 16 bit addresses and pack if true.
+ unsigned DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
+ MVT VAddrVT = Op.getOperand(DimIdx).getSimpleValueType();
+ const MVT VAddrScalarVT = VAddrVT.getScalarType();
+ if (((VAddrScalarVT == MVT::f16) || (VAddrScalarVT == MVT::i16)) &&
+ ST->hasFeature(AMDGPU::FeatureR128A16)) {
+ IsA16 = true;
+ const MVT VectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
+ for (unsigned i = AddrIdx; i < (AddrIdx + NumMIVAddrs); ++i) {
+ SDValue AddrLo, AddrHi;
+ // Push back extra arguments.
+ if (i < DimIdx) {
+ AddrLo = Op.getOperand(i);
+ } else {
+ AddrLo = Op.getOperand(i);
+ // Dz/dh, dz/dv and the last odd coord are packed with undef. Also,
+ // in 1D, derivatives dx/dh and dx/dv are packed with undef.
+ if (((i + 1) >= (AddrIdx + NumMIVAddrs)) ||
+ ((NumGradients / 2) % 2 == 1 &&
+ (i == DimIdx + (NumGradients / 2) - 1 ||
+ i == DimIdx + NumGradients - 1))) {
+ AddrHi = DAG.getUNDEF(MVT::f16);
+ } else {
+ AddrHi = Op.getOperand(i + 1);
+ i++;
+ }
+ AddrLo = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VectorVT,
+ {AddrLo, AddrHi});
+ AddrLo = DAG.getBitcast(MVT::i32, AddrLo);
+ }
+ VAddrs.push_back(AddrLo);
+ }
+ } else {
+ for (unsigned i = 0; i < NumMIVAddrs; ++i)
+ VAddrs.push_back(Op.getOperand(AddrIdx + i));
+ }
+
SDValue VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
@@ -4674,11 +4934,53 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
CtrlIdx = AddrIdx + NumVAddrs + 3;
}
+ SDValue TFE;
+ SDValue LWE;
SDValue TexFail = Op.getOperand(CtrlIdx);
- auto TexFailConst = dyn_cast<ConstantSDNode>(TexFail.getNode());
- if (!TexFailConst || TexFailConst->getZExtValue() != 0)
+ bool IsTexFail = false;
+ if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
return Op;
+ if (IsTexFail) {
+ if (!DMaskLanes) {
+ // TFC is on and dmask is 0, so an error flag is still expected.
+ // Force dmask to be at least 1; otherwise the instruction will fail.
+ DMask = 0x1;
+ DMaskLanes = 1;
+ NumVDataDwords = 1;
+ }
+ NumVDataDwords += 1;
+ AdjustRetType = true;
+ }
+
+ // Check whether something earlier flagged that the return type needs adjusting.
+ // This happens if the instruction is a load or has TexFailCtrl flags set.
+ if (AdjustRetType) {
+ // NumVDataDwords reflects the true number of dwords required in the return type
+ if (DMaskLanes == 0 && !BaseOpcode->Store) {
+ // This is a no-op load. This can be eliminated
+ SDValue Undef = DAG.getUNDEF(Op.getValueType());
+ if (isa<MemSDNode>(Op))
+ return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
+ return Undef;
+ }
+
+ // Have to use a power of 2 number of dwords
+ NumVDataDwords = 1 << Log2_32_Ceil(NumVDataDwords);
+
+ EVT NewVT = NumVDataDwords > 1 ?
+ EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumVDataDwords)
+ : MVT::f32;
+
+ ResultTypes[0] = NewVT;
+ if (ResultTypes.size() == 3) {
+ // Original result was aggregate type used for TexFailCtrl results
+ // The actual instruction returns as a vector type which has now been
+ // created. Remove the aggregate result.
+ ResultTypes.erase(&ResultTypes[1]);
+ }
+ }
+
SDValue GLC;
SDValue SLC;
if (BaseOpcode->Atomic) {
@@ -4701,9 +5003,10 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
Ops.push_back(Unorm);
Ops.push_back(GLC);
Ops.push_back(SLC);
- Ops.push_back(False); // r128
- Ops.push_back(False); // tfe
- Ops.push_back(False); // lwe
+ Ops.push_back(IsA16 && // a16 or r128
+ ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
+ Ops.push_back(TFE); // tfe
+ Ops.push_back(LWE); // lwe
Ops.push_back(DimInfo->DA ? True : False);
if (BaseOpcode->HasD16)
Ops.push_back(IsD16 ? True : False);
@@ -4723,25 +5026,90 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
if (auto MemOp = dyn_cast<MemSDNode>(Op)) {
- MachineInstr::mmo_iterator MemRefs = MF.allocateMemRefsArray(1);
- *MemRefs = MemOp->getMemOperand();
- NewNode->setMemRefs(MemRefs, MemRefs + 1);
+ MachineMemOperand *MemRef = MemOp->getMemOperand();
+ DAG.setNodeMemRefs(NewNode, {MemRef});
}
if (BaseOpcode->AtomicX2) {
SmallVector<SDValue, 1> Elt;
DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
- } else if (IsD16 && !BaseOpcode->Store) {
- MVT LoadVT = Op.getSimpleValueType();
- SDValue Adjusted = adjustLoadValueTypeImpl(
- SDValue(NewNode, 0), LoadVT, DL, DAG, Subtarget->hasUnpackedD16VMem());
- return DAG.getMergeValues({Adjusted, SDValue(NewNode, 1)}, DL);
+ } else if (!BaseOpcode->Store) {
+ return constructRetValue(DAG, NewNode,
+ OrigResultTypes, IsTexFail,
+ Subtarget->hasUnpackedD16VMem(), IsD16,
+ DMaskLanes, NumVDataDwords, DL,
+ *DAG.getContext());
}
return SDValue(NewNode, 0);
}
+/// Lower a buffer load of type \p VT from resource \p Rsrc at \p Offset.
+///
+/// When \p Offset is not divergent a single SBUFFER_LOAD node is emitted with
+/// the operands {Rsrc, Offset, GLC}.  When \p Offset is divergent a MUBUF
+/// BUFFER_LOAD is emitted instead; 8- and 16-element vectors are split into
+/// 4-element loads placed 16 bytes apart and concatenated back into \p VT.
+///
+/// \param VT     Result type.  On the divergent path its scalar type must be
+///               i32 or f32 and its element count a power of two (asserted).
+/// \param DL     Debug location for the created nodes.
+/// \param Rsrc   Buffer resource descriptor operand.
+/// \param Offset Offset operand; its divergence selects the lowering.
+/// \param GLC    Cache-policy operand.  Must be a ConstantSDNode when
+///               \p Offset is divergent (it is read with cast<>).
+/// \param DAG    The selection DAG to build nodes in.
+/// \returns The loaded value, of type \p VT.
+SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
+                                       SDValue Offset, SDValue GLC,
+                                       SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  // The load is described as invariant and dereferenceable; the store size of
+  // VT is passed for both trailing arguments of getMachineMemOperand.
+  MachineMemOperand *MMO = MF.getMachineMemOperand(
+      MachinePointerInfo(),
+      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+          MachineMemOperand::MOInvariant,
+      VT.getStoreSize(), VT.getStoreSize());
+
+  if (!Offset->isDivergent()) {
+    SDValue Ops[] = {
+        Rsrc,
+        Offset, // Offset
+        GLC     // glc
+    };
+    return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
+                                   DAG.getVTList(VT), Ops, VT, MMO);
+  }
+
+  // We have a divergent offset. Emit a MUBUF buffer load instead. We can
+  // assume that the buffer is unswizzled.
+  SmallVector<SDValue, 4> Loads;
+  unsigned NumLoads = 1;
+  MVT LoadVT = VT.getSimpleVT();
+  unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
+  assert((LoadVT.getScalarType() == MVT::i32 ||
+          LoadVT.getScalarType() == MVT::f32) &&
+         isPowerOf2_32(NumElts));
+
+  // Vectors of 8 or 16 elements are loaded as several 4-element pieces.
+  const bool IsSplit = NumElts == 8 || NumElts == 16;
+  if (IsSplit) {
+    NumLoads = NumElts / 4;
+    // Keep the original scalar type (i32 or f32) for the pieces so that they
+    // can be concatenated directly into VT.  Always using v4i32 here, and
+    // concatenating only for v8i32/v16i32, returned just the first v4i32
+    // piece for v8f32/v16f32.
+    LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
+  }
+
+  SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
+  unsigned CachePolicy = cast<ConstantSDNode>(GLC)->getZExtValue();
+  SDValue Ops[] = {
+      DAG.getEntryNode(),                         // Chain
+      Rsrc,                                       // rsrc
+      DAG.getConstant(0, DL, MVT::i32),           // vindex
+      {},                                         // voffset
+      {},                                         // soffset
+      {},                                         // offset
+      DAG.getConstant(CachePolicy, DL, MVT::i32), // cachepolicy
+      DAG.getConstant(0, DL, MVT::i1),            // idxen
+  };
+
+  // Use the alignment to ensure that the required offsets will fit into the
+  // immediate offsets.
+  setBufferOffsets(Offset, DAG, &Ops[3], NumLoads > 1 ? 16 * NumLoads : 4);
+
+  // Ops[5] now holds the constant immediate offset chosen by
+  // setBufferOffsets; each piece is loaded 16 bytes after the previous one.
+  uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue();
+  for (unsigned i = 0; i < NumLoads; ++i) {
+    Ops[5] = DAG.getConstant(InstOffset + 16 * i, DL, MVT::i32);
+    Loads.push_back(DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList,
+                                            Ops, LoadVT, MMO));
+  }
+
+  // Reassemble split loads regardless of whether the elements are i32 or f32.
+  if (IsSplit)
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
+
+  return Loads[0];
+}
+
+
SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
@@ -4755,14 +5123,14 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
switch (IntrinsicID) {
case Intrinsic::amdgcn_implicit_buffer_ptr: {
- if (getSubtarget()->isAmdCodeObjectV2(MF.getFunction()))
+ if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
return emitNonHSAIntrinsicError(DAG, DL, VT);
return getPreloadedValue(DAG, *MFI, VT,
AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
}
case Intrinsic::amdgcn_dispatch_ptr:
case Intrinsic::amdgcn_queue_ptr: {
- if (!Subtarget->isAmdCodeObjectV2(MF.getFunction())) {
+ if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
DiagnosticInfoUnsupported BadIntrin(
MF.getFunction(), "unsupported hsa intrinsic without hsa target",
DL.getDebugLoc());
@@ -4880,12 +5248,11 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::r600_read_tgid_z:
return getPreloadedValue(DAG, *MFI, VT,
AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
- case Intrinsic::amdgcn_workitem_id_x: {
+ case Intrinsic::amdgcn_workitem_id_x:
case Intrinsic::r600_read_tidig_x:
return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
SDLoc(DAG.getEntryNode()),
MFI->getArgInfo().WorkItemIDX);
- }
case Intrinsic::amdgcn_workitem_id_y:
case Intrinsic::r600_read_tidig_y:
return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
@@ -4896,19 +5263,16 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
SDLoc(DAG.getEntryNode()),
MFI->getArgInfo().WorkItemIDZ);
- case AMDGPUIntrinsic::SI_load_const: {
- SDValue Ops[] = {
- Op.getOperand(1),
- Op.getOperand(2)
- };
-
- MachineMemOperand *MMO = MF.getMachineMemOperand(
- MachinePointerInfo(),
- MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOInvariant,
- VT.getStoreSize(), 4);
- return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
- Op->getVTList(), Ops, VT, MMO);
+ case SIIntrinsic::SI_load_const: {
+ SDValue Load =
+ lowerSBuffer(MVT::i32, DL, Op.getOperand(1), Op.getOperand(2),
+ DAG.getTargetConstant(0, DL, MVT::i1), DAG);
+ return DAG.getNode(ISD::BITCAST, DL, MVT::f32, Load);
+ }
+ case Intrinsic::amdgcn_s_buffer_load: {
+ unsigned Cache = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
+ return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
+ DAG.getTargetConstant(Cache & 1, DL, MVT::i1), DAG);
}
case Intrinsic::amdgcn_fdiv_fast:
return lowerFDIV_FAST(Op, DAG);
@@ -4991,34 +5355,15 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Denominator, Numerator);
}
case Intrinsic::amdgcn_icmp: {
- const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3));
- if (!CD)
- return DAG.getUNDEF(VT);
-
- int CondCode = CD->getSExtValue();
- if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE ||
- CondCode > ICmpInst::Predicate::LAST_ICMP_PREDICATE)
- return DAG.getUNDEF(VT);
-
- ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
- ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
- return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1),
- Op.getOperand(2), DAG.getCondCode(CCOpcode));
+ // There is a Pat that handles this variant, so return it as-is.
+ if (Op.getOperand(1).getValueType() == MVT::i1 &&
+ Op.getConstantOperandVal(2) == 0 &&
+ Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
+ return Op;
+ return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
}
case Intrinsic::amdgcn_fcmp: {
- const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3));
- if (!CD)
- return DAG.getUNDEF(VT);
-
- int CondCode = CD->getSExtValue();
- if (CondCode < FCmpInst::Predicate::FIRST_FCMP_PREDICATE ||
- CondCode > FCmpInst::Predicate::LAST_FCMP_PREDICATE)
- return DAG.getUNDEF(VT);
-
- FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
- ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
- return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1),
- Op.getOperand(2), DAG.getCondCode(CCOpcode));
+ return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
}
case Intrinsic::amdgcn_fmed3:
return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
@@ -5058,6 +5403,9 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
else
Opcode = AMDGPUISD::CVT_PK_U16_U32;
+ if (isTypeLegal(VT))
+ return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
+
SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
Op.getOperand(1), Op.getOperand(2));
return DAG.getNode(ISD::BITCAST, DL, VT, Node);
@@ -5127,36 +5475,104 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
}
case Intrinsic::amdgcn_buffer_load:
case Intrinsic::amdgcn_buffer_load_format: {
+ unsigned Glc = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
+ unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
+ unsigned IdxEn = 1;
+ if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3)))
+ IdxEn = Idx->getZExtValue() != 0;
SDValue Ops[] = {
Op.getOperand(0), // Chain
Op.getOperand(2), // rsrc
Op.getOperand(3), // vindex
- Op.getOperand(4), // offset
- Op.getOperand(5), // glc
- Op.getOperand(6) // slc
+ SDValue(), // voffset -- will be set by setBufferOffsets
+ SDValue(), // soffset -- will be set by setBufferOffsets
+ SDValue(), // offset -- will be set by setBufferOffsets
+ DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
+ DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
};
+ setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]);
unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
+
EVT VT = Op.getValueType();
EVT IntVT = VT.changeTypeToInteger();
auto *M = cast<MemSDNode>(Op);
EVT LoadVT = Op.getValueType();
- bool IsD16 = LoadVT.getScalarType() == MVT::f16;
- if (IsD16)
- return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG);
+ if (LoadVT.getScalarType() == MVT::f16)
+ return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
+ M, DAG, Ops);
+ return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
+ M->getMemOperand());
+ }
+ case Intrinsic::amdgcn_raw_buffer_load:
+ case Intrinsic::amdgcn_raw_buffer_load_format: {
+ auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
+ SDValue Ops[] = {
+ Op.getOperand(0), // Chain
+ Op.getOperand(2), // rsrc
+ DAG.getConstant(0, DL, MVT::i32), // vindex
+ Offsets.first, // voffset
+ Op.getOperand(4), // soffset
+ Offsets.second, // offset
+ Op.getOperand(5), // cachepolicy
+ DAG.getConstant(0, DL, MVT::i1), // idxen
+ };
+
+ unsigned Opc = (IntrID == Intrinsic::amdgcn_raw_buffer_load) ?
+ AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
+
+ EVT VT = Op.getValueType();
+ EVT IntVT = VT.changeTypeToInteger();
+ auto *M = cast<MemSDNode>(Op);
+ EVT LoadVT = Op.getValueType();
+
+ if (LoadVT.getScalarType() == MVT::f16)
+ return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
+ M, DAG, Ops);
+ return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
+ M->getMemOperand());
+ }
+ case Intrinsic::amdgcn_struct_buffer_load:
+ case Intrinsic::amdgcn_struct_buffer_load_format: {
+ auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
+ SDValue Ops[] = {
+ Op.getOperand(0), // Chain
+ Op.getOperand(2), // rsrc
+ Op.getOperand(3), // vindex
+ Offsets.first, // voffset
+ Op.getOperand(5), // soffset
+ Offsets.second, // offset
+ Op.getOperand(6), // cachepolicy
+ DAG.getConstant(1, DL, MVT::i1), // idxen
+ };
+
+ unsigned Opc = (IntrID == Intrinsic::amdgcn_struct_buffer_load) ?
+ AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
+
+ EVT VT = Op.getValueType();
+ EVT IntVT = VT.changeTypeToInteger();
+ auto *M = cast<MemSDNode>(Op);
+ EVT LoadVT = Op.getValueType();
+
+ if (LoadVT.getScalarType() == MVT::f16)
+ return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
+ M, DAG, Ops);
return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
M->getMemOperand());
}
case Intrinsic::amdgcn_tbuffer_load: {
MemSDNode *M = cast<MemSDNode>(Op);
EVT LoadVT = Op.getValueType();
- bool IsD16 = LoadVT.getScalarType() == MVT::f16;
- if (IsD16) {
- return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG);
- }
+ unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
+ unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
+ unsigned Glc = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
+ unsigned Slc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
+ unsigned IdxEn = 1;
+ if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3)))
+ IdxEn = Idx->getZExtValue() != 0;
SDValue Ops[] = {
Op.getOperand(0), // Chain
Op.getOperand(2), // rsrc
@@ -5164,12 +5580,62 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Op.getOperand(4), // voffset
Op.getOperand(5), // soffset
Op.getOperand(6), // offset
- Op.getOperand(7), // dfmt
- Op.getOperand(8), // nfmt
- Op.getOperand(9), // glc
- Op.getOperand(10) // slc
+ DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
+ DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
+ DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
+ };
+
+ if (LoadVT.getScalarType() == MVT::f16)
+ return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
+ M, DAG, Ops);
+ return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
+ Op->getVTList(), Ops, LoadVT,
+ M->getMemOperand());
+ }
+ case Intrinsic::amdgcn_raw_tbuffer_load: {
+ MemSDNode *M = cast<MemSDNode>(Op);
+ EVT LoadVT = Op.getValueType();
+ auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
+
+ SDValue Ops[] = {
+ Op.getOperand(0), // Chain
+ Op.getOperand(2), // rsrc
+ DAG.getConstant(0, DL, MVT::i32), // vindex
+ Offsets.first, // voffset
+ Op.getOperand(4), // soffset
+ Offsets.second, // offset
+ Op.getOperand(5), // format
+ Op.getOperand(6), // cachepolicy
+ DAG.getConstant(0, DL, MVT::i1), // idxen
+ };
+
+ if (LoadVT.getScalarType() == MVT::f16)
+ return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
+ M, DAG, Ops);
+ return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
+ Op->getVTList(), Ops, LoadVT,
+ M->getMemOperand());
+ }
+ case Intrinsic::amdgcn_struct_tbuffer_load: {
+ MemSDNode *M = cast<MemSDNode>(Op);
+ EVT LoadVT = Op.getValueType();
+ auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
+
+ SDValue Ops[] = {
+ Op.getOperand(0), // Chain
+ Op.getOperand(2), // rsrc
+ Op.getOperand(3), // vindex
+ Offsets.first, // voffset
+ Op.getOperand(5), // soffset
+ Offsets.second, // offset
+ Op.getOperand(6), // format
+ Op.getOperand(7), // cachepolicy
+ DAG.getConstant(1, DL, MVT::i1), // idxen
};
+ if (LoadVT.getScalarType() == MVT::f16)
+ return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
+ M, DAG, Ops);
return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
Op->getVTList(), Ops, LoadVT,
M->getMemOperand());
@@ -5184,14 +5650,22 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_buffer_atomic_and:
case Intrinsic::amdgcn_buffer_atomic_or:
case Intrinsic::amdgcn_buffer_atomic_xor: {
+ unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
+ unsigned IdxEn = 1;
+ if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
+ IdxEn = Idx->getZExtValue() != 0;
SDValue Ops[] = {
Op.getOperand(0), // Chain
Op.getOperand(2), // vdata
Op.getOperand(3), // rsrc
Op.getOperand(4), // vindex
- Op.getOperand(5), // offset
- Op.getOperand(6) // slc
+ SDValue(), // voffset -- will be set by setBufferOffsets
+ SDValue(), // soffset -- will be set by setBufferOffsets
+ SDValue(), // offset -- will be set by setBufferOffsets
+ DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
+ DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
};
+ setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
EVT VT = Op.getValueType();
auto *M = cast<MemSDNode>(Op);
@@ -5235,16 +5709,193 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
M->getMemOperand());
}
+ case Intrinsic::amdgcn_raw_buffer_atomic_swap:
+ case Intrinsic::amdgcn_raw_buffer_atomic_add:
+ case Intrinsic::amdgcn_raw_buffer_atomic_sub:
+ case Intrinsic::amdgcn_raw_buffer_atomic_smin:
+ case Intrinsic::amdgcn_raw_buffer_atomic_umin:
+ case Intrinsic::amdgcn_raw_buffer_atomic_smax:
+ case Intrinsic::amdgcn_raw_buffer_atomic_umax:
+ case Intrinsic::amdgcn_raw_buffer_atomic_and:
+ case Intrinsic::amdgcn_raw_buffer_atomic_or:
+ case Intrinsic::amdgcn_raw_buffer_atomic_xor: {
+ auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
+ SDValue Ops[] = {
+ Op.getOperand(0), // Chain
+ Op.getOperand(2), // vdata
+ Op.getOperand(3), // rsrc
+ DAG.getConstant(0, DL, MVT::i32), // vindex
+ Offsets.first, // voffset
+ Op.getOperand(5), // soffset
+ Offsets.second, // offset
+ Op.getOperand(6), // cachepolicy
+ DAG.getConstant(0, DL, MVT::i1), // idxen
+ };
+ EVT VT = Op.getValueType();
+
+ auto *M = cast<MemSDNode>(Op);
+ unsigned Opcode = 0;
+
+ switch (IntrID) {
+ case Intrinsic::amdgcn_raw_buffer_atomic_swap:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
+ break;
+ case Intrinsic::amdgcn_raw_buffer_atomic_add:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
+ break;
+ case Intrinsic::amdgcn_raw_buffer_atomic_sub:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
+ break;
+ case Intrinsic::amdgcn_raw_buffer_atomic_smin:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
+ break;
+ case Intrinsic::amdgcn_raw_buffer_atomic_umin:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
+ break;
+ case Intrinsic::amdgcn_raw_buffer_atomic_smax:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
+ break;
+ case Intrinsic::amdgcn_raw_buffer_atomic_umax:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
+ break;
+ case Intrinsic::amdgcn_raw_buffer_atomic_and:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
+ break;
+ case Intrinsic::amdgcn_raw_buffer_atomic_or:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
+ break;
+ case Intrinsic::amdgcn_raw_buffer_atomic_xor:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
+ break;
+ default:
+ llvm_unreachable("unhandled atomic opcode");
+ }
+
+ return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
+ M->getMemOperand());
+ }
+ case Intrinsic::amdgcn_struct_buffer_atomic_swap:
+ case Intrinsic::amdgcn_struct_buffer_atomic_add:
+ case Intrinsic::amdgcn_struct_buffer_atomic_sub:
+ case Intrinsic::amdgcn_struct_buffer_atomic_smin:
+ case Intrinsic::amdgcn_struct_buffer_atomic_umin:
+ case Intrinsic::amdgcn_struct_buffer_atomic_smax:
+ case Intrinsic::amdgcn_struct_buffer_atomic_umax:
+ case Intrinsic::amdgcn_struct_buffer_atomic_and:
+ case Intrinsic::amdgcn_struct_buffer_atomic_or:
+ case Intrinsic::amdgcn_struct_buffer_atomic_xor: {
+ auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
+ SDValue Ops[] = {
+ Op.getOperand(0), // Chain
+ Op.getOperand(2), // vdata
+ Op.getOperand(3), // rsrc
+ Op.getOperand(4), // vindex
+ Offsets.first, // voffset
+ Op.getOperand(6), // soffset
+ Offsets.second, // offset
+ Op.getOperand(7), // cachepolicy
+ DAG.getConstant(1, DL, MVT::i1), // idxen
+ };
+ EVT VT = Op.getValueType();
+
+ auto *M = cast<MemSDNode>(Op);
+ unsigned Opcode = 0;
+
+ switch (IntrID) {
+ case Intrinsic::amdgcn_struct_buffer_atomic_swap:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
+ break;
+ case Intrinsic::amdgcn_struct_buffer_atomic_add:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
+ break;
+ case Intrinsic::amdgcn_struct_buffer_atomic_sub:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
+ break;
+ case Intrinsic::amdgcn_struct_buffer_atomic_smin:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
+ break;
+ case Intrinsic::amdgcn_struct_buffer_atomic_umin:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
+ break;
+ case Intrinsic::amdgcn_struct_buffer_atomic_smax:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
+ break;
+ case Intrinsic::amdgcn_struct_buffer_atomic_umax:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
+ break;
+ case Intrinsic::amdgcn_struct_buffer_atomic_and:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
+ break;
+ case Intrinsic::amdgcn_struct_buffer_atomic_or:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
+ break;
+ case Intrinsic::amdgcn_struct_buffer_atomic_xor:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
+ break;
+ default:
+ llvm_unreachable("unhandled atomic opcode");
+ }
+ return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
+ M->getMemOperand());
+ }
case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
+ unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
+ unsigned IdxEn = 1;
+ if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(5)))
+ IdxEn = Idx->getZExtValue() != 0;
+ SDValue Ops[] = {
+ Op.getOperand(0), // Chain
+ Op.getOperand(2), // src
+ Op.getOperand(3), // cmp
+ Op.getOperand(4), // rsrc
+ Op.getOperand(5), // vindex
+ SDValue(), // voffset -- will be set by setBufferOffsets
+ SDValue(), // soffset -- will be set by setBufferOffsets
+ SDValue(), // offset -- will be set by setBufferOffsets
+ DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
+ DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
+ };
+ setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
+ EVT VT = Op.getValueType();
+ auto *M = cast<MemSDNode>(Op);
+
+ return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
+ Op->getVTList(), Ops, VT, M->getMemOperand());
+ }
+ case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: {
+ auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
+ SDValue Ops[] = {
+ Op.getOperand(0), // Chain
+ Op.getOperand(2), // src
+ Op.getOperand(3), // cmp
+ Op.getOperand(4), // rsrc
+ DAG.getConstant(0, DL, MVT::i32), // vindex
+ Offsets.first, // voffset
+ Op.getOperand(6), // soffset
+ Offsets.second, // offset
+ Op.getOperand(7), // cachepolicy
+ DAG.getConstant(0, DL, MVT::i1), // idxen
+ };
+ EVT VT = Op.getValueType();
+ auto *M = cast<MemSDNode>(Op);
+
+ return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
+ Op->getVTList(), Ops, VT, M->getMemOperand());
+ }
+ case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: {
+ auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
SDValue Ops[] = {
Op.getOperand(0), // Chain
Op.getOperand(2), // src
Op.getOperand(3), // cmp
Op.getOperand(4), // rsrc
Op.getOperand(5), // vindex
- Op.getOperand(6), // offset
- Op.getOperand(7) // slc
+ Offsets.first, // voffset
+ Op.getOperand(7), // soffset
+ Offsets.second, // offset
+ Op.getOperand(8), // cachepolicy
+ DAG.getConstant(1, DL, MVT::i1), // idxen
};
EVT VT = Op.getValueType();
auto *M = cast<MemSDNode>(Op);
@@ -5360,19 +6011,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain,
Op.getOperand(2), Op.getOperand(3));
}
- case AMDGPUIntrinsic::AMDGPU_kill: {
- SDValue Src = Op.getOperand(2);
- if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Src)) {
- if (!K->isNegative())
- return Chain;
-
- SDValue NegOne = DAG.getTargetConstant(FloatToBits(-1.0f), DL, MVT::i32);
- return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, NegOne);
- }
-
- SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Src);
- return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, Cast);
- }
case Intrinsic::amdgcn_s_barrier: {
if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
@@ -5383,69 +6021,79 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
}
return SDValue();
};
- case AMDGPUIntrinsic::SI_tbuffer_store: {
-
- // Extract vindex and voffset from vaddr as appropriate
- const ConstantSDNode *OffEn = cast<ConstantSDNode>(Op.getOperand(10));
- const ConstantSDNode *IdxEn = cast<ConstantSDNode>(Op.getOperand(11));
- SDValue VAddr = Op.getOperand(5);
-
- SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
-
- assert(!(OffEn->isOne() && IdxEn->isOne()) &&
- "Legacy intrinsic doesn't support both offset and index - use new version");
-
- SDValue VIndex = IdxEn->isOne() ? VAddr : Zero;
- SDValue VOffset = OffEn->isOne() ? VAddr : Zero;
-
- // Deal with the vec-3 case
- const ConstantSDNode *NumChannels = cast<ConstantSDNode>(Op.getOperand(4));
- auto Opcode = NumChannels->getZExtValue() == 3 ?
- AMDGPUISD::TBUFFER_STORE_FORMAT_X3 : AMDGPUISD::TBUFFER_STORE_FORMAT;
-
+ case Intrinsic::amdgcn_tbuffer_store: {
+ SDValue VData = Op.getOperand(2);
+ bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
+ if (IsD16)
+ VData = handleD16VData(VData, DAG);
+ unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
+ unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
+ unsigned Glc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
+ unsigned Slc = cast<ConstantSDNode>(Op.getOperand(11))->getZExtValue();
+ unsigned IdxEn = 1;
+ if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
+ IdxEn = Idx->getZExtValue() != 0;
SDValue Ops[] = {
- Chain,
- Op.getOperand(3), // vdata
- Op.getOperand(2), // rsrc
- VIndex,
- VOffset,
- Op.getOperand(6), // soffset
- Op.getOperand(7), // inst_offset
- Op.getOperand(8), // dfmt
- Op.getOperand(9), // nfmt
- Op.getOperand(12), // glc
- Op.getOperand(13), // slc
+ Chain,
+ VData, // vdata
+ Op.getOperand(3), // rsrc
+ Op.getOperand(4), // vindex
+ Op.getOperand(5), // voffset
+ Op.getOperand(6), // soffset
+ Op.getOperand(7), // offset
+ DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
+ DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
+ DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
};
-
- assert((cast<ConstantSDNode>(Op.getOperand(14)))->getZExtValue() == 0 &&
- "Value of tfe other than zero is unsupported");
-
- EVT VT = Op.getOperand(3).getValueType();
- MachineMemOperand *MMO = MF.getMachineMemOperand(
- MachinePointerInfo(),
- MachineMemOperand::MOStore,
- VT.getStoreSize(), 4);
- return DAG.getMemIntrinsicNode(Opcode, DL,
- Op->getVTList(), Ops, VT, MMO);
+ unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
+ AMDGPUISD::TBUFFER_STORE_FORMAT;
+ MemSDNode *M = cast<MemSDNode>(Op);
+ return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
+ M->getMemoryVT(), M->getMemOperand());
}
- case Intrinsic::amdgcn_tbuffer_store: {
+ case Intrinsic::amdgcn_struct_tbuffer_store: {
SDValue VData = Op.getOperand(2);
bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
if (IsD16)
VData = handleD16VData(VData, DAG);
+ auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
SDValue Ops[] = {
Chain,
VData, // vdata
Op.getOperand(3), // rsrc
Op.getOperand(4), // vindex
- Op.getOperand(5), // voffset
+ Offsets.first, // voffset
Op.getOperand(6), // soffset
- Op.getOperand(7), // offset
- Op.getOperand(8), // dfmt
- Op.getOperand(9), // nfmt
- Op.getOperand(10), // glc
- Op.getOperand(11) // slc
+ Offsets.second, // offset
+ Op.getOperand(7), // format
+ Op.getOperand(8), // cachepolicy
+ DAG.getConstant(1, DL, MVT::i1), // idxen
+ };
+ unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
+ AMDGPUISD::TBUFFER_STORE_FORMAT;
+ MemSDNode *M = cast<MemSDNode>(Op);
+ return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
+ M->getMemoryVT(), M->getMemOperand());
+ }
+
+ case Intrinsic::amdgcn_raw_tbuffer_store: {
+ SDValue VData = Op.getOperand(2);
+ bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
+ if (IsD16)
+ VData = handleD16VData(VData, DAG);
+ auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
+ SDValue Ops[] = {
+ Chain,
+ VData, // vdata
+ Op.getOperand(3), // rsrc
+ DAG.getConstant(0, DL, MVT::i32), // vindex
+ Offsets.first, // voffset
+ Op.getOperand(5), // soffset
+ Offsets.second, // offset
+ Op.getOperand(6), // format
+ Op.getOperand(7), // cachepolicy
+ DAG.getConstant(0, DL, MVT::i1), // idxen
};
unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
AMDGPUISD::TBUFFER_STORE_FORMAT;
@@ -5460,15 +6108,23 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
if (IsD16)
VData = handleD16VData(VData, DAG);
+ unsigned Glc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
+ unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
+ unsigned IdxEn = 1;
+ if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
+ IdxEn = Idx->getZExtValue() != 0;
SDValue Ops[] = {
Chain,
- VData, // vdata
+ VData,
Op.getOperand(3), // rsrc
Op.getOperand(4), // vindex
- Op.getOperand(5), // offset
- Op.getOperand(6), // glc
- Op.getOperand(7) // slc
+ SDValue(), // voffset -- will be set by setBufferOffsets
+ SDValue(), // soffset -- will be set by setBufferOffsets
+ SDValue(), // offset -- will be set by setBufferOffsets
+ DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
+ DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
};
+ setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
@@ -5476,6 +6132,59 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
M->getMemoryVT(), M->getMemOperand());
}
+
+ case Intrinsic::amdgcn_raw_buffer_store:
+ case Intrinsic::amdgcn_raw_buffer_store_format: {
+ SDValue VData = Op.getOperand(2);
+ bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
+ if (IsD16)
+ VData = handleD16VData(VData, DAG);
+ auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
+ SDValue Ops[] = {
+ Chain,
+ VData,
+ Op.getOperand(3), // rsrc
+ DAG.getConstant(0, DL, MVT::i32), // vindex
+ Offsets.first, // voffset
+ Op.getOperand(5), // soffset
+ Offsets.second, // offset
+ Op.getOperand(6), // cachepolicy
+ DAG.getConstant(0, DL, MVT::i1), // idxen
+ };
+ unsigned Opc = IntrinsicID == Intrinsic::amdgcn_raw_buffer_store ?
+ AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
+ Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
+ MemSDNode *M = cast<MemSDNode>(Op);
+ return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
+ M->getMemoryVT(), M->getMemOperand());
+ }
+
+ case Intrinsic::amdgcn_struct_buffer_store:
+ case Intrinsic::amdgcn_struct_buffer_store_format: {
+ SDValue VData = Op.getOperand(2);
+ bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
+ if (IsD16)
+ VData = handleD16VData(VData, DAG);
+ auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
+ SDValue Ops[] = {
+ Chain,
+ VData,
+ Op.getOperand(3), // rsrc
+ Op.getOperand(4), // vindex
+ Offsets.first, // voffset
+ Op.getOperand(6), // soffset
+ Offsets.second, // offset
+ Op.getOperand(7), // cachepolicy
+ DAG.getConstant(1, DL, MVT::i1), // idxen
+ };
+ unsigned Opc = IntrinsicID == Intrinsic::amdgcn_struct_buffer_store ?
+ AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
+ Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
+ MemSDNode *M = cast<MemSDNode>(Op);
+ return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
+ M->getMemoryVT(), M->getMemOperand());
+ }
+
default: {
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
@@ -5486,6 +6195,94 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
}
}
+// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
+// offset (the offset that is included in bounds checking and swizzling, to be
+// split between the instruction's voffset and immoffset fields) and soffset
+// (the offset that is excluded from bounds checking and swizzling, to go in
+// the instruction's soffset field). This function takes the first kind of
+// offset and returns it split into the pair (voffset, immoffset).
+std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
+ SDValue Offset, SelectionDAG &DAG) const {
+ SDLoc DL(Offset);
+ const unsigned MaxImm = 4095;
+ SDValue N0 = Offset;
+ ConstantSDNode *C1 = nullptr;
+
+ if ((C1 = dyn_cast<ConstantSDNode>(N0)))
+ N0 = SDValue();
+ else if (DAG.isBaseWithConstantOffset(N0)) {
+ C1 = cast<ConstantSDNode>(N0.getOperand(1));
+ N0 = N0.getOperand(0);
+ }
+
+ if (C1) {
+ unsigned ImmOffset = C1->getZExtValue();
+ // If the immediate value is too big for the immoffset field, keep only its
+ // low 12 bits (value & 4095) there, so that the value that is copied/added
+ // for the voffset field is a multiple of 4096, and it stands more chance
+ // of being CSEd with the copy/add for another similar load/store.
+ // However, do not do that rounding down to a multiple of 4096 if that is a
+ // negative number, as it appears to be illegal to have a negative offset
+ // in the vgpr, even if adding the immediate offset makes it positive.
+ unsigned Overflow = ImmOffset & ~MaxImm;
+ ImmOffset -= Overflow;
+ if ((int32_t)Overflow < 0) {
+ Overflow += ImmOffset;
+ ImmOffset = 0;
+ }
+ C1 = cast<ConstantSDNode>(DAG.getConstant(ImmOffset, DL, MVT::i32));
+ if (Overflow) {
+ auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
+ if (!N0)
+ N0 = OverflowVal;
+ else {
+ SDValue Ops[] = { N0, OverflowVal };
+ N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
+ }
+ }
+ }
+ if (!N0)
+ N0 = DAG.getConstant(0, DL, MVT::i32);
+ if (!C1)
+ C1 = cast<ConstantSDNode>(DAG.getConstant(0, DL, MVT::i32));
+ return {N0, SDValue(C1, 0)};
+}
+
+// Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
+// three offsets (voffset, soffset and instoffset, in that order) into the
+// SDValue[3] array pointed to by Offsets.
+void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
+ SelectionDAG &DAG, SDValue *Offsets,
+ unsigned Align) const {
+ SDLoc DL(CombinedOffset);
+ if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
+ uint32_t Imm = C->getZExtValue();
+ uint32_t SOffset, ImmOffset;
+ if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget, Align)) {
+ Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
+ Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
+ Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
+ return;
+ }
+ }
+ if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
+ SDValue N0 = CombinedOffset.getOperand(0);
+ SDValue N1 = CombinedOffset.getOperand(1);
+ uint32_t SOffset, ImmOffset;
+ int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
+ if (Offset >= 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
+ Subtarget, Align)) {
+ Offsets[0] = N0;
+ Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
+ Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
+ return;
+ }
+ }
+ Offsets[0] = CombinedOffset;
+ Offsets[1] = DAG.getConstant(0, DL, MVT::i32);
+ Offsets[2] = DAG.getConstant(0, DL, MVT::i32);
+}
+
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
ISD::LoadExtType ExtType, SDValue Op,
const SDLoc &SL, EVT VT) {
@@ -5513,8 +6310,8 @@ SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const
// FIXME: Constant loads should all be marked invariant.
unsigned AS = Ld->getAddressSpace();
- if (AS != AMDGPUASI.CONSTANT_ADDRESS &&
- AS != AMDGPUASI.CONSTANT_ADDRESS_32BIT &&
+ if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
+ AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
(AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
return SDValue();
@@ -5625,15 +6422,15 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
// If there is a possibilty that flat instruction access scratch memory
// then we need to use the same legalization rules we use for private.
- if (AS == AMDGPUASI.FLAT_ADDRESS)
+ if (AS == AMDGPUAS::FLAT_ADDRESS)
AS = MFI->hasFlatScratchInit() ?
- AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS;
+ AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
unsigned NumElements = MemVT.getVectorNumElements();
- if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
- AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT) {
- if (!Op->isDivergent() && Alignment >= 4)
+ if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
+ AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
+ if (!Op->isDivergent() && Alignment >= 4 && NumElements < 32)
return SDValue();
// Non-uniform loads will be selected to MUBUF instructions, so they
// have the same legalization requirements as global and private
@@ -5641,28 +6438,28 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
//
}
- if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
- AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT ||
- AS == AMDGPUASI.GLOBAL_ADDRESS) {
+ if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
+ AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
+ AS == AMDGPUAS::GLOBAL_ADDRESS) {
if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
!Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load) &&
- Alignment >= 4)
+ Alignment >= 4 && NumElements < 32)
return SDValue();
// Non-uniform loads will be selected to MUBUF instructions, so they
// have the same legalization requirements as global and private
// loads.
//
}
- if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
- AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT ||
- AS == AMDGPUASI.GLOBAL_ADDRESS ||
- AS == AMDGPUASI.FLAT_ADDRESS) {
+ if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
+ AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
+ AS == AMDGPUAS::GLOBAL_ADDRESS ||
+ AS == AMDGPUAS::FLAT_ADDRESS) {
if (NumElements > 4)
return SplitVectorLoad(Op, DAG);
// v4 loads are supported for private and global memory.
return SDValue();
}
- if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
+ if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
// Depending on the setting of the private_element_size field in the
// resource descriptor, we can only make private accesses up to a certain
// size.
@@ -5681,7 +6478,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
default:
llvm_unreachable("unsupported private_element_size");
}
- } else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
+ } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
// Use ds_read_b128 if possible.
if (Subtarget->useDS128() && Load->getAlignment() >= 16 &&
MemVT.getStoreSize() == 16)
@@ -5689,6 +6486,17 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
if (NumElements > 2)
return SplitVectorLoad(Op, DAG);
+
+ // SI has a hardware bug in the LDS / GDS bounds checking: if the base
+ // address is negative, then the instruction is incorrectly treated as
+ // out-of-bounds even if base + offsets is in bounds. Split vectorized
+ // loads here to avoid emitting ds_read2_b32. We may re-combine the
+ // load later in the SILoadStoreOptimizer.
+ if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
+ NumElements == 2 && MemVT.getStoreSize() == 8 &&
+ Load->getAlignment() < 8) {
+ return SplitVectorLoad(Op, DAG);
+ }
}
return SDValue();
}
@@ -6058,17 +6866,17 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
// If there is a possibilty that flat instruction access scratch memory
// then we need to use the same legalization rules we use for private.
- if (AS == AMDGPUASI.FLAT_ADDRESS)
+ if (AS == AMDGPUAS::FLAT_ADDRESS)
AS = MFI->hasFlatScratchInit() ?
- AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS;
+ AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
unsigned NumElements = VT.getVectorNumElements();
- if (AS == AMDGPUASI.GLOBAL_ADDRESS ||
- AS == AMDGPUASI.FLAT_ADDRESS) {
+ if (AS == AMDGPUAS::GLOBAL_ADDRESS ||
+ AS == AMDGPUAS::FLAT_ADDRESS) {
if (NumElements > 4)
return SplitVectorStore(Op, DAG);
return SDValue();
- } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
+ } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
switch (Subtarget->getMaxPrivateElementSize()) {
case 4:
return scalarizeVectorStore(Store, DAG);
@@ -6083,7 +6891,7 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
default:
llvm_unreachable("unsupported private_element_size");
}
- } else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
+ } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
// Use ds_write_b128 if possible.
if (Subtarget->useDS128() && Store->getAlignment() >= 16 &&
VT.getStoreSize() == 16)
@@ -6091,6 +6899,18 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
if (NumElements > 2)
return SplitVectorStore(Op, DAG);
+
+ // SI has a hardware bug in the LDS / GDS bounds checking: if the base
+ // address is negative, then the instruction is incorrectly treated as
+ // out-of-bounds even if base + offsets is in bounds. Split vectorized
+ // stores here to avoid emitting ds_write2_b32. We may re-combine the
+ // store later in the SILoadStoreOptimizer.
+ if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
+ NumElements == 2 && VT.getStoreSize() == 8 &&
+ Store->getAlignment() < 8) {
+ return SplitVectorStore(Op, DAG);
+ }
+
return SDValue();
} else {
llvm_unreachable("unhandled address space");
@@ -6101,17 +6921,24 @@ SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
EVT VT = Op.getValueType();
SDValue Arg = Op.getOperand(0);
+ SDValue TrigVal;
+
// TODO: Should this propagate fast-math-flags?
- SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
- DAG.getNode(ISD::FMUL, DL, VT, Arg,
- DAG.getConstantFP(0.5/M_PI, DL,
- VT)));
+
+ SDValue OneOver2Pi = DAG.getConstantFP(0.5 / M_PI, DL, VT);
+
+ if (Subtarget->hasTrigReducedRange()) {
+ SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi);
+ TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal);
+ } else {
+ TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi);
+ }
switch (Op.getOpcode()) {
case ISD::FCOS:
- return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, FractPart);
+ return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal);
case ISD::FSIN:
- return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, FractPart);
+ return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal);
default:
llvm_unreachable("Wrong trig opcode");
}
@@ -6123,7 +6950,7 @@ SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) co
unsigned AS = AtomicNode->getAddressSpace();
// No custom lowering required for local address space
- if (!isFlatGlobalAddrSpace(AS, AMDGPUASI))
+ if (!isFlatGlobalAddrSpace(AS))
return Op;
// Non-local address space requires custom lowering for atomic compare
@@ -6475,6 +7302,29 @@ SDValue SITargetLowering::performAndCombine(SDNode *N,
}
}
+ if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
+ std::swap(LHS, RHS);
+
+ if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
+ RHS.hasOneUse()) {
+ ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
+ // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | n_nan)
+ // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan)
+ const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
+ if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
+ (RHS.getOperand(0) == LHS.getOperand(0) &&
+ LHS.getOperand(0) == LHS.getOperand(1))) {
+ const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
+ unsigned NewMask = LCC == ISD::SETO ?
+ Mask->getZExtValue() & ~OrdMask :
+ Mask->getZExtValue() & OrdMask;
+
+ SDLoc DL(N);
+ return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
+ DAG.getConstant(NewMask, DL, MVT::i32));
+ }
+ }
+
if (VT == MVT::i32 &&
(RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) {
// and x, (sext cc from i1) => select cc, x, 0
@@ -6798,158 +7648,294 @@ SDValue SITargetLowering::performRcpCombine(SDNode *N,
return AMDGPUTargetLowering::performRcpCombine(N, DCI);
}
-static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
- if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions())
+bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
+ unsigned MaxDepth) const {
+ unsigned Opcode = Op.getOpcode();
+ if (Opcode == ISD::FCANONICALIZE)
return true;
- return DAG.isKnownNeverNaN(Op);
-}
+ if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
+ auto F = CFP->getValueAPF();
+ if (F.isNaN() && F.isSignaling())
+ return false;
+ return !F.isDenormal() || denormalsEnabledForType(Op.getValueType());
+ }
-static bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
- const GCNSubtarget *ST, unsigned MaxDepth=5) {
// If source is a result of another standard FP operation it is already in
// canonical form.
+ if (MaxDepth == 0)
+ return false;
- switch (Op.getOpcode()) {
- default:
- break;
-
+ switch (Opcode) {
// These will flush denorms if required.
case ISD::FADD:
case ISD::FSUB:
case ISD::FMUL:
- case ISD::FSQRT:
case ISD::FCEIL:
case ISD::FFLOOR:
case ISD::FMA:
case ISD::FMAD:
-
- case ISD::FCANONICALIZE:
- return true;
-
+ case ISD::FSQRT:
+ case ISD::FDIV:
+ case ISD::FREM:
case ISD::FP_ROUND:
- return Op.getValueType().getScalarType() != MVT::f16 ||
- ST->hasFP16Denormals();
-
case ISD::FP_EXTEND:
- return Op.getOperand(0).getValueType().getScalarType() != MVT::f16 ||
- ST->hasFP16Denormals();
+ case AMDGPUISD::FMUL_LEGACY:
+ case AMDGPUISD::FMAD_FTZ:
+ case AMDGPUISD::RCP:
+ case AMDGPUISD::RSQ:
+ case AMDGPUISD::RSQ_CLAMP:
+ case AMDGPUISD::RCP_LEGACY:
+ case AMDGPUISD::RSQ_LEGACY:
+ case AMDGPUISD::RCP_IFLAG:
+ case AMDGPUISD::TRIG_PREOP:
+ case AMDGPUISD::DIV_SCALE:
+ case AMDGPUISD::DIV_FMAS:
+ case AMDGPUISD::DIV_FIXUP:
+ case AMDGPUISD::FRACT:
+ case AMDGPUISD::LDEXP:
+ case AMDGPUISD::CVT_PKRTZ_F16_F32:
+ case AMDGPUISD::CVT_F32_UBYTE0:
+ case AMDGPUISD::CVT_F32_UBYTE1:
+ case AMDGPUISD::CVT_F32_UBYTE2:
+ case AMDGPUISD::CVT_F32_UBYTE3:
+ return true;
// It can/will be lowered or combined as a bit operation.
// Need to check their input recursively to handle.
case ISD::FNEG:
case ISD::FABS:
- return (MaxDepth > 0) &&
- isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1);
+ case ISD::FCOPYSIGN:
+ return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
case ISD::FSIN:
case ISD::FCOS:
case ISD::FSINCOS:
return Op.getValueType().getScalarType() != MVT::f16;
- // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms.
- // For such targets need to check their input recursively.
case ISD::FMINNUM:
case ISD::FMAXNUM:
- case ISD::FMINNAN:
- case ISD::FMAXNAN:
+ case ISD::FMINNUM_IEEE:
+ case ISD::FMAXNUM_IEEE:
+ case AMDGPUISD::CLAMP:
+ case AMDGPUISD::FMED3:
+ case AMDGPUISD::FMAX3:
+ case AMDGPUISD::FMIN3: {
+ // FIXME: Shouldn't treat the generic operations differently based on these.
+ // However, we aren't really required to flush the result from
+ // minnum/maxnum.
- if (ST->supportsMinMaxDenormModes() &&
- DAG.isKnownNeverNaN(Op.getOperand(0)) &&
- DAG.isKnownNeverNaN(Op.getOperand(1)))
+ // snans will be quieted, so we only need to worry about denormals.
+ if (Subtarget->supportsMinMaxDenormModes() ||
+ denormalsEnabledForType(Op.getValueType()))
return true;
- return (MaxDepth > 0) &&
- isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1) &&
- isCanonicalized(DAG, Op.getOperand(1), ST, MaxDepth - 1);
+ // Flushing may be required.
+ // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
+ // targets need to check their input recursively.
+
+ // FIXME: Does this apply with clamp? It's implemented with max.
+ for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
+ if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
+ return false;
+ }
- case ISD::ConstantFP: {
- auto F = cast<ConstantFPSDNode>(Op)->getValueAPF();
- return !F.isDenormal() && !(F.isNaN() && F.isSignaling());
+ return true;
}
+ case ISD::SELECT: {
+ return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
+ isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
}
- return false;
+ case ISD::BUILD_VECTOR: {
+ for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
+ SDValue SrcOp = Op.getOperand(i);
+ if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
+ return false;
+ }
+
+ return true;
+ }
+ case ISD::EXTRACT_VECTOR_ELT:
+ case ISD::EXTRACT_SUBVECTOR: {
+ return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
+ }
+ case ISD::INSERT_VECTOR_ELT: {
+ return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
+ isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
+ }
+ case ISD::UNDEF:
+ // Could be anything.
+ return false;
+
+ case ISD::BITCAST: {
+ // Hack round the mess we make when legalizing extract_vector_elt
+ SDValue Src = Op.getOperand(0);
+ if (Src.getValueType() == MVT::i16 &&
+ Src.getOpcode() == ISD::TRUNCATE) {
+ SDValue TruncSrc = Src.getOperand(0);
+ if (TruncSrc.getValueType() == MVT::i32 &&
+ TruncSrc.getOpcode() == ISD::BITCAST &&
+ TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
+ return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
+ }
+ }
+
+ return false;
+ }
+ case ISD::INTRINSIC_WO_CHAIN: {
+ unsigned IntrinsicID
+ = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ // TODO: Handle more intrinsics
+ switch (IntrinsicID) {
+ case Intrinsic::amdgcn_cvt_pkrtz:
+ case Intrinsic::amdgcn_cubeid:
+ case Intrinsic::amdgcn_frexp_mant:
+ case Intrinsic::amdgcn_fdot2:
+ return true;
+ default:
+ break;
+ }
+
+ LLVM_FALLTHROUGH;
+ }
+ default:
+ return denormalsEnabledForType(Op.getValueType()) &&
+ DAG.isKnownNeverSNaN(Op);
+ }
+
+ llvm_unreachable("invalid operation");
}
// Constant fold canonicalize.
+SDValue SITargetLowering::getCanonicalConstantFP(
+ SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
+ // Flush denormals to 0 if they are not enabled for this type.
+ if (C.isDenormal() && !denormalsEnabledForType(VT))
+ return DAG.getConstantFP(0.0, SL, VT);
+
+ if (C.isNaN()) {
+ APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
+ if (C.isSignaling()) {
+ // Quiet a signaling NaN.
+ // FIXME: Is this supposed to preserve payload bits?
+ return DAG.getConstantFP(CanonicalQNaN, SL, VT);
+ }
+
+ // Make sure it is the canonical NaN bitpattern.
+ //
+ // TODO: Can we use -1 as the canonical NaN value since it's an inline
+ // immediate?
+ if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
+ return DAG.getConstantFP(CanonicalQNaN, SL, VT);
+ }
+
+ // Already canonical: rebuild the constant unchanged.
+ return DAG.getConstantFP(C, SL, VT);
+}
+
+static bool vectorEltWillFoldAway(SDValue Op) {
+ return Op.isUndef() || isa<ConstantFPSDNode>(Op); // Both fold to a constant.
+}
+
SDValue SITargetLowering::performFCanonicalizeCombine(
SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
// fcanonicalize undef -> qnan
if (N0.isUndef()) {
- EVT VT = N->getValueType(0);
APFloat QNaN = APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT));
return DAG.getConstantFP(QNaN, SDLoc(N), VT);
}
- ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0);
- if (!CFP) {
- SDValue N0 = N->getOperand(0);
- EVT VT = N0.getValueType().getScalarType();
- auto ST = getSubtarget();
+ if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
+ EVT VT = N->getValueType(0);
+ return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
+ }
- if (((VT == MVT::f32 && ST->hasFP32Denormals()) ||
- (VT == MVT::f64 && ST->hasFP64Denormals()) ||
- (VT == MVT::f16 && ST->hasFP16Denormals())) &&
- DAG.isKnownNeverNaN(N0))
- return N0;
+ // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
+ // (fcanonicalize k)
+ //
+ // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
- bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
+ // TODO: This could be better with wider vectors that will be split to v2f16,
+ // and to consider uses since there aren't that many packed operations.
+ if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
+ isTypeLegal(MVT::v2f16)) {
+ SDLoc SL(N);
+ SDValue NewElts[2];
+ SDValue Lo = N0.getOperand(0);
+ SDValue Hi = N0.getOperand(1);
+ EVT EltVT = Lo.getValueType();
- if ((IsIEEEMode || isKnownNeverSNan(DAG, N0)) &&
- isCanonicalized(DAG, N0, ST))
- return N0;
+ if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) {
+ for (unsigned I = 0; I != 2; ++I) {
+ SDValue Op = N0.getOperand(I);
+ if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
+ NewElts[I] = getCanonicalConstantFP(DAG, SL, EltVT,
+ CFP->getValueAPF());
+ } else if (Op.isUndef()) {
+ // Handled below based on what the other operand is.
+ NewElts[I] = Op;
+ } else {
+ NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
+ }
+ }
- return SDValue();
- }
+ // If one half is undef, and one is constant, prefer a splat vector rather
+ // than the normal qNaN. If it's a register, prefer 0.0 since that's
+ // cheaper to use and may be free with a packed operation.
+ // Apply the same rule to either half: splat a constant, else use 0.0.
+ if (NewElts[0].isUndef()) {
+ NewElts[0] = isa<ConstantFPSDNode>(NewElts[1]) ?
+ NewElts[1]: DAG.getConstantFP(0.0f, SL, EltVT);
+ }
- const APFloat &C = CFP->getValueAPF();
+ if (NewElts[1].isUndef()) {
+ NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) ?
+ NewElts[0] : DAG.getConstantFP(0.0f, SL, EltVT);
+ }
- // Flush denormals to 0 if not enabled.
- if (C.isDenormal()) {
- EVT VT = N->getValueType(0);
- EVT SVT = VT.getScalarType();
- if (SVT == MVT::f32 && !Subtarget->hasFP32Denormals())
- return DAG.getConstantFP(0.0, SDLoc(N), VT);
+ return DAG.getBuildVector(VT, SL, NewElts);
+ }
+ }
- if (SVT == MVT::f64 && !Subtarget->hasFP64Denormals())
- return DAG.getConstantFP(0.0, SDLoc(N), VT);
+ unsigned SrcOpc = N0.getOpcode();
- if (SVT == MVT::f16 && !Subtarget->hasFP16Denormals())
- return DAG.getConstantFP(0.0, SDLoc(N), VT);
- }
+ // If it's free to do so, push canonicalizes further up the source, which may
+ // find a canonical source.
+ //
+ // TODO: More opcodes. Note this is unsafe for the _ieee minnum/maxnum for
+ // sNaNs.
+ if (SrcOpc == ISD::FMINNUM || SrcOpc == ISD::FMAXNUM) {
+ auto *CRHS = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
+ if (CRHS && N0.hasOneUse()) {
+ SDLoc SL(N);
+ SDValue Canon0 = DAG.getNode(ISD::FCANONICALIZE, SL, VT,
+ N0.getOperand(0));
+ SDValue Canon1 = getCanonicalConstantFP(DAG, SL, VT, CRHS->getValueAPF());
+ DCI.AddToWorklist(Canon0.getNode());
- if (C.isNaN()) {
- EVT VT = N->getValueType(0);
- APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
- if (C.isSignaling()) {
- // Quiet a signaling NaN.
- return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
+ return DAG.getNode(N0.getOpcode(), SL, VT, Canon0, Canon1);
}
-
- // Make sure it is the canonical NaN bitpattern.
- //
- // TODO: Can we use -1 as the canonical NaN value since it's an inline
- // immediate?
- if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
- return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
}
- return N0;
+ return isCanonicalized(DAG, N0) ? N0 : SDValue();
}
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
switch (Opc) {
case ISD::FMAXNUM:
+ case ISD::FMAXNUM_IEEE:
return AMDGPUISD::FMAX3;
case ISD::SMAX:
return AMDGPUISD::SMAX3;
case ISD::UMAX:
return AMDGPUISD::UMAX3;
case ISD::FMINNUM:
+ case ISD::FMINNUM_IEEE:
return AMDGPUISD::FMIN3;
case ISD::SMIN:
return AMDGPUISD::SMIN3;
@@ -7044,11 +8030,18 @@ SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
// then give the other result, which is different from med3 with a NaN
// input.
SDValue Var = Op0.getOperand(0);
- if (!isKnownNeverSNan(DAG, Var))
+ if (!DAG.isKnownNeverSNaN(Var))
return SDValue();
- return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
- Var, SDValue(K0, 0), SDValue(K1, 0));
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+
+ if ((!K0->hasOneUse() ||
+ TII->isInlineConstant(K0->getValueAPF().bitcastToAPInt())) &&
+ (!K1->hasOneUse() ||
+ TII->isInlineConstant(K1->getValueAPF().bitcastToAPInt()))) {
+ return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
+ Var, SDValue(K0, 0), SDValue(K1, 0));
+ }
}
return SDValue();
@@ -7109,6 +8102,7 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
// fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
+ (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
(Opc == AMDGPUISD::FMIN_LEGACY &&
Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
(VT == MVT::f32 || VT == MVT::f64 ||
@@ -7216,9 +8210,11 @@ SDValue SITargetLowering::performExtractVectorEltCombine(
switch(Opc) {
default:
- return SDValue();
+ break;
// TODO: Support other binary operations.
case ISD::FADD:
+ case ISD::FSUB:
+ case ISD::FMUL:
case ISD::ADD:
case ISD::UMIN:
case ISD::UMAX:
@@ -7226,25 +8222,54 @@ SDValue SITargetLowering::performExtractVectorEltCombine(
case ISD::SMAX:
case ISD::FMAXNUM:
case ISD::FMINNUM:
- return DAG.getNode(Opc, SL, EltVT,
- DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
- Vec.getOperand(0), Idx),
- DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
- Vec.getOperand(1), Idx));
+ case ISD::FMAXNUM_IEEE:
+ case ISD::FMINNUM_IEEE: {
+ SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
+ Vec.getOperand(0), Idx);
+ SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
+ Vec.getOperand(1), Idx);
+
+ DCI.AddToWorklist(Elt0.getNode());
+ DCI.AddToWorklist(Elt1.getNode());
+ return DAG.getNode(Opc, SL, EltVT, Elt0, Elt1, Vec->getFlags());
+ }
}
}
- if (!DCI.isBeforeLegalize())
- return SDValue();
-
unsigned VecSize = VecVT.getSizeInBits();
unsigned EltSize = EltVT.getSizeInBits();
+ // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
+ // This eliminates non-constant index and subsequent movrel or scratch access.
+ // Sub-dword vectors of size 2 dword or less have better implementation.
+ // Vectors of size bigger than 8 dwords would yield too many v_cndmask_b32
+ // instructions.
+ if (VecSize <= 256 && (VecSize > 64 || EltSize >= 32) &&
+ !isa<ConstantSDNode>(N->getOperand(1))) {
+ SDLoc SL(N);
+ SDValue Idx = N->getOperand(1);
+ EVT IdxVT = Idx.getValueType();
+ SDValue V;
+ for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
+ SDValue IC = DAG.getConstant(I, SL, IdxVT);
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
+ if (I == 0)
+ V = Elt;
+ else
+ V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
+ }
+ return V;
+ }
+
+ if (!DCI.isBeforeLegalize())
+ return SDValue();
+
// Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
// elements. This exposes more load reduction opportunities by replacing
// multiple small extract_vector_elements with a single 32-bit extract.
auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
- if (EltSize <= 16 &&
+ if (isa<MemSDNode>(Vec) &&
+ EltSize <= 16 &&
EltVT.isByteSized() &&
VecSize > 32 &&
VecSize % 32 == 0 &&
@@ -7274,46 +8299,40 @@ SDValue SITargetLowering::performExtractVectorEltCombine(
return SDValue();
}
-static bool convertBuildVectorCastElt(SelectionDAG &DAG,
- SDValue &Lo, SDValue &Hi) {
- if (Hi.getOpcode() == ISD::BITCAST &&
- Hi.getOperand(0).getValueType() == MVT::f16 &&
- (isa<ConstantSDNode>(Lo) || Lo.isUndef())) {
- Lo = DAG.getNode(ISD::BITCAST, SDLoc(Lo), MVT::f16, Lo);
- Hi = Hi.getOperand(0);
- return true;
- }
-
- return false;
-}
-
-SDValue SITargetLowering::performBuildVectorCombine(
- SDNode *N, DAGCombinerInfo &DCI) const {
- SDLoc SL(N);
+SDValue
+SITargetLowering::performInsertVectorEltCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SDValue Vec = N->getOperand(0);
+ SDValue Idx = N->getOperand(2);
+ EVT VecVT = Vec.getValueType();
+ EVT EltVT = VecVT.getVectorElementType();
+ unsigned VecSize = VecVT.getSizeInBits();
+ unsigned EltSize = EltVT.getSizeInBits();
- if (!isTypeLegal(MVT::v2i16))
+ // INSERT_VECTOR_ELT (<n x e>, var-idx)
+ // => BUILD_VECTOR n x select (e, const-idx)
+ // This eliminates non-constant index and subsequent movrel or scratch access.
+ // Sub-dword vectors of size 2 dword or less have better implementation.
+ // Vectors of size bigger than 8 dwords would yield too many v_cndmask_b32
+ // instructions.
+ if (isa<ConstantSDNode>(Idx) ||
+ VecSize > 256 || (VecSize <= 64 && EltSize < 32))
return SDValue();
- SelectionDAG &DAG = DCI.DAG;
- EVT VT = N->getValueType(0);
-
- if (VT == MVT::v2i16) {
- SDValue Lo = N->getOperand(0);
- SDValue Hi = N->getOperand(1);
- // v2i16 build_vector (const|undef), (bitcast f16:$x)
- // -> bitcast (v2f16 build_vector const|undef, $x
- if (convertBuildVectorCastElt(DAG, Lo, Hi)) {
- SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, { Lo, Hi });
- return DAG.getNode(ISD::BITCAST, SL, VT, NewVec);
- }
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc SL(N);
+ SDValue Ins = N->getOperand(1);
+ EVT IdxVT = Idx.getValueType();
- if (convertBuildVectorCastElt(DAG, Hi, Lo)) {
- SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, { Hi, Lo });
- return DAG.getNode(ISD::BITCAST, SL, VT, NewVec);
- }
+ SmallVector<SDValue, 16> Ops;
+ for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
+ SDValue IC = DAG.getConstant(I, SL, IdxVT);
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
+ SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
+ Ops.push_back(V);
}
- return SDValue();
+ return DAG.getBuildVector(VecVT, SL, Ops);
}
unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
@@ -7568,7 +8587,7 @@ SDValue SITargetLowering::performFMACombine(SDNode *N,
EVT VT = N->getValueType(0);
SDLoc SL(N);
- if (!Subtarget->hasDLInsts() || VT != MVT::f32)
+ if (!Subtarget->hasDotInsts() || VT != MVT::f32)
return SDValue();
// FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
@@ -7705,16 +8724,26 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N,
VT != MVT::f16))
return SDValue();
- // Match isinf pattern
+ // Match isinf/isfinite pattern
// (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
- if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) {
+ // (fcmp one (fabs x), inf) -> (fp_class x,
+ // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
+ if ((CC == ISD::SETOEQ || CC == ISD::SETONE) && LHS.getOpcode() == ISD::FABS) {
const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
if (!CRHS)
return SDValue();
const APFloat &APF = CRHS->getValueAPF();
if (APF.isInfinity() && !APF.isNegative()) {
- unsigned Mask = SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
+ const unsigned IsInfMask = SIInstrFlags::P_INFINITY |
+ SIInstrFlags::N_INFINITY;
+ const unsigned IsFiniteMask = SIInstrFlags::N_ZERO |
+ SIInstrFlags::P_ZERO |
+ SIInstrFlags::N_NORMAL |
+ SIInstrFlags::P_NORMAL |
+ SIInstrFlags::N_SUBNORMAL |
+ SIInstrFlags::P_SUBNORMAL;
+ unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
DAG.getConstant(Mask, SL, MVT::i32));
}
@@ -7759,8 +8788,7 @@ SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
!DCI.isBeforeLegalizeOps());
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (TLI.ShrinkDemandedConstant(Src, Demanded, TLO) ||
- TLI.SimplifyDemandedBits(Src, Demanded, Known, TLO)) {
+ if (TLI.SimplifyDemandedBits(Src, Demanded, Known, TLO)) {
DCI.CommitTargetLoweringOpt(TLO);
}
@@ -7792,6 +8820,9 @@ SDValue SITargetLowering::performClampCombine(SDNode *N,
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
+ if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
+ return SDValue();
+
switch (N->getOpcode()) {
default:
return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
@@ -7810,17 +8841,15 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return performSetCCCombine(N, DCI);
case ISD::FMAXNUM:
case ISD::FMINNUM:
+ case ISD::FMAXNUM_IEEE:
+ case ISD::FMINNUM_IEEE:
case ISD::SMAX:
case ISD::SMIN:
case ISD::UMAX:
case ISD::UMIN:
case AMDGPUISD::FMIN_LEGACY:
- case AMDGPUISD::FMAX_LEGACY: {
- if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
- getTargetMachine().getOptLevel() > CodeGenOpt::None)
- return performMinMaxCombine(N, DCI);
- break;
- }
+ case AMDGPUISD::FMAX_LEGACY:
+ return performMinMaxCombine(N, DCI);
case ISD::FMA:
return performFMACombine(N, DCI);
case ISD::LOAD: {
@@ -7912,8 +8941,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
}
case ISD::EXTRACT_VECTOR_ELT:
return performExtractVectorEltCombine(N, DCI);
- case ISD::BUILD_VECTOR:
- return performBuildVectorCombine(N, DCI);
+ case ISD::INSERT_VECTOR_ELT:
+ return performInsertVectorEltCombine(N, DCI);
}
return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}
@@ -7926,6 +8955,7 @@ static unsigned SubIdx2Lane(unsigned Idx) {
case AMDGPU::sub1: return 1;
case AMDGPU::sub2: return 2;
case AMDGPU::sub3: return 3;
+ case AMDGPU::sub4: return 4; // Possible with TFE/LWE
}
}
@@ -7939,11 +8969,16 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
return Node; // not implemented for D16
- SDNode *Users[4] = { nullptr };
+ SDNode *Users[5] = { nullptr };
unsigned Lane = 0;
unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
unsigned NewDmask = 0;
+ unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
+ unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
+ bool UsesTFC = (Node->getConstantOperandVal(TFEIdx) ||
+ Node->getConstantOperandVal(LWEIdx)) ? 1 : 0;
+ unsigned TFCLane = 0;
bool HasChain = Node->getNumValues() > 1;
if (OldDmask == 0) {
@@ -7951,6 +8986,12 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
return Node;
}
+ unsigned OldBitsSet = countPopulation(OldDmask);
+ // Work out which is the TFE/LWE lane if that is enabled.
+ if (UsesTFC) {
+ TFCLane = OldBitsSet;
+ }
+
// Try to figure out the used register components
for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
I != E; ++I) {
@@ -7970,28 +9011,49 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
// set, etc.
Lane = SubIdx2Lane(I->getConstantOperandVal(1));
- // Set which texture component corresponds to the lane.
- unsigned Comp;
- for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
- Comp = countTrailingZeros(Dmask);
- Dmask &= ~(1 << Comp);
- }
+ // Check if the use is for the TFE/LWE generated result at VGPRn+1.
+ if (UsesTFC && Lane == TFCLane) {
+ Users[Lane] = *I;
+ } else {
+ // Set which texture component corresponds to the lane.
+ unsigned Comp;
+ for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
+ Comp = countTrailingZeros(Dmask);
+ Dmask &= ~(1 << Comp);
+ }
- // Abort if we have more than one user per component
- if (Users[Lane])
- return Node;
+ // Abort if we have more than one user per component.
+ if (Users[Lane])
+ return Node;
- Users[Lane] = *I;
- NewDmask |= 1 << Comp;
+ Users[Lane] = *I;
+ NewDmask |= 1 << Comp;
+ }
}
+ // Don't allow 0 dmask, as hardware assumes one channel enabled.
+ bool NoChannels = !NewDmask;
+ if (NoChannels) {
+ // If the original dmask has one channel - then nothing to do
+ if (OldBitsSet == 1)
+ return Node;
+ // Use an arbitrary dmask - required for the instruction to work
+ NewDmask = 1;
+ }
// Abort if there's no change
if (NewDmask == OldDmask)
return Node;
unsigned BitsSet = countPopulation(NewDmask);
- int NewOpcode = AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), BitsSet);
+ // Check for TFE or LWE - increase the number of channels by one to account
+ // for the extra return value
+ // This will need adjustment for D16 if that is ever handled in
+ // adjustWritemask (this function), but at present D16 is excluded.
+ unsigned NewChannels = BitsSet + UsesTFC;
+
+ int NewOpcode =
+ AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
assert(NewOpcode != -1 &&
NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
"failed to find equivalent MIMG op");
@@ -8004,8 +9066,9 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
- MVT ResultVT = BitsSet == 1 ?
- SVT : MVT::getVectorVT(SVT, BitsSet == 3 ? 4 : BitsSet);
+ MVT ResultVT = NewChannels == 1 ?
+ SVT : MVT::getVectorVT(SVT, NewChannels == 3 ? 4 :
+ NewChannels == 5 ? 8 : NewChannels);
SDVTList NewVTList = HasChain ?
DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
@@ -8015,11 +9078,11 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
if (HasChain) {
// Update chain.
- NewNode->setMemRefs(Node->memoperands_begin(), Node->memoperands_end());
+ DAG.setNodeMemRefs(NewNode, Node->memoperands());
DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
}
- if (BitsSet == 1) {
+ if (NewChannels == 1) {
assert(Node->hasNUsesOfValue(1, 0));
SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
SDLoc(Node), Users[Lane]->getValueType(0),
@@ -8029,19 +9092,24 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
}
// Update the users of the node with the new indices
- for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {
+ for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
SDNode *User = Users[i];
- if (!User)
- continue;
-
- SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
- DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
+ if (!User) {
+ // Handle the special case of NoChannels. We set NewDmask to 1 above, but
+ // Users[0] is still nullptr because channel 0 doesn't really have a use.
+ if (i || !NoChannels)
+ continue;
+ } else {
+ SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
+ DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
+ }
switch (Idx) {
default: break;
case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
+ case AMDGPU::sub3: Idx = AMDGPU::sub4; break;
}
}
@@ -8457,49 +9525,56 @@ void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
Known.Zero.setHighBits(AssumeFrameIndexHighZeroBits);
}
+LLVM_ATTRIBUTE_UNUSED
+static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
+ assert(N->getOpcode() == ISD::CopyFromReg);
+ do {
+ // Follow the chain until we find an INLINEASM node.
+ N = N->getOperand(0).getNode();
+ if (N->getOpcode() == ISD::INLINEASM)
+ return true;
+ } while (N->getOpcode() == ISD::CopyFromReg);
+ return false;
+}
+
bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N,
- FunctionLoweringInfo * FLI, DivergenceAnalysis * DA) const
+ FunctionLoweringInfo * FLI, LegacyDivergenceAnalysis * KDA) const
{
switch (N->getOpcode()) {
- case ISD::Register:
case ISD::CopyFromReg:
{
- const RegisterSDNode *R = nullptr;
- if (N->getOpcode() == ISD::Register) {
- R = dyn_cast<RegisterSDNode>(N);
- }
- else {
- R = dyn_cast<RegisterSDNode>(N->getOperand(1));
- }
- if (R)
- {
- const MachineFunction * MF = FLI->MF;
- const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
- const MachineRegisterInfo &MRI = MF->getRegInfo();
- const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo();
- unsigned Reg = R->getReg();
- if (TRI.isPhysicalRegister(Reg))
- return TRI.isVGPR(MRI, Reg);
+ const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
+ const MachineFunction * MF = FLI->MF;
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
+ const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo();
+ unsigned Reg = R->getReg();
+ if (TRI.isPhysicalRegister(Reg))
+ return !TRI.isSGPRReg(MRI, Reg);
- if (MRI.isLiveIn(Reg)) {
- // workitem.id.x workitem.id.y workitem.id.z
- // Any VGPR formal argument is also considered divergent
- if (TRI.isVGPR(MRI, Reg))
- return true;
- // Formal arguments of non-entry functions
- // are conservatively considered divergent
- else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv()))
- return true;
- }
- return !DA || DA->isDivergent(FLI->getValueFromVirtualReg(Reg));
+ if (MRI.isLiveIn(Reg)) {
+ // workitem.id.x workitem.id.y workitem.id.z
+ // Any VGPR formal argument is also considered divergent
+ if (!TRI.isSGPRReg(MRI, Reg))
+ return true;
+ // Formal arguments of non-entry functions
+ // are conservatively considered divergent
+ else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv()))
+ return true;
+ return false;
}
+ const Value *V = FLI->getValueFromVirtualReg(Reg);
+ if (V)
+ return KDA->isDivergent(V);
+ assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
+ return !TRI.isSGPRReg(MRI, Reg);
}
break;
case ISD::LOAD: {
- const LoadSDNode *L = dyn_cast<LoadSDNode>(N);
- if (L->getMemOperand()->getAddrSpace() ==
- Subtarget->getAMDGPUAS().PRIVATE_ADDRESS)
- return true;
+ const LoadSDNode *L = cast<LoadSDNode>(N);
+ unsigned AS = L->getAddressSpace();
+ // A flat load may access private memory.
+ return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
} break;
case ISD::CALLSEQ_END:
return true;
@@ -8522,3 +9597,30 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N,
}
return false;
}
+
+bool SITargetLowering::denormalsEnabledForType(EVT VT) const {
+ switch (VT.getScalarType().getSimpleVT().SimpleTy) {
+ case MVT::f32:
+ return Subtarget->hasFP32Denormals();
+ case MVT::f64:
+ return Subtarget->hasFP64Denormals();
+ case MVT::f16:
+ return Subtarget->hasFP16Denormals();
+ default:
+ return false;
+ }
+}
+
+bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
+ const SelectionDAG &DAG,
+ bool SNaN,
+ unsigned Depth) const {
+ if (Op.getOpcode() == AMDGPUISD::CLAMP) {
+ if (Subtarget->enableDX10Clamp())
+ return true; // Clamped to 0.
+ return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
+ }
+
+ return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG,
+ SNaN, Depth);
+}
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index 5b3d49b3d8e30..bcef519ee6635 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -60,11 +60,22 @@ private:
MVT VT, unsigned Offset) const;
SDValue lowerImage(SDValue Op, const AMDGPU::ImageDimIntrinsicInfo *Intr,
SelectionDAG &DAG) const;
+ SDValue lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, SDValue Offset,
+ SDValue GLC, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
+ // The raw.tbuffer and struct.tbuffer intrinsics have two offset args: offset
+ // (the offset that is included in bounds checking and swizzling, to be split
+ // between the instruction's voffset and immoffset fields) and soffset (the
+ // offset that is excluded from bounds checking and swizzling, to go in the
+ // instruction's soffset field). This function takes the first kind of
+ // offset and figures out how to split it between voffset and immoffset.
+ std::pair<SDValue, SDValue> splitBufferOffsets(SDValue Offset,
+ SelectionDAG &DAG) const;
+
SDValue widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const;
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
@@ -81,7 +92,7 @@ private:
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
SDValue adjustLoadValueType(unsigned Opcode, MemSDNode *M,
- SelectionDAG &DAG,
+ SelectionDAG &DAG, ArrayRef<SDValue> Ops,
bool IsIntrinsic = false) const;
SDValue handleD16VData(SDValue VData, SelectionDAG &DAG) const;
@@ -99,6 +110,7 @@ private:
/// Custom lowering for ISD::FP_ROUND for MVT::f16.
SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;
SDValue getSegmentAperture(unsigned AS, const SDLoc &DL,
SelectionDAG &DAG) const;
@@ -130,6 +142,8 @@ private:
SDValue performXorCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performZeroExtendCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue getCanonicalConstantFP(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
+ const APFloat &C) const;
SDValue performFCanonicalizeCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
@@ -140,7 +154,7 @@ private:
SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
- SDValue performBuildVectorCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performInsertVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
unsigned getFusedOpcode(const SelectionDAG &DAG,
const SDNode *N0, const SDNode *N1) const;
@@ -156,7 +170,6 @@ private:
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const;
bool isLegalFlatAddressingMode(const AddrMode &AM) const;
- bool isLegalGlobalAddressingMode(const AddrMode &AM) const;
bool isLegalMUBUFAddressingMode(const AddrMode &AM) const;
unsigned isCFIntrinsic(const SDNode *Intr) const;
@@ -175,6 +188,12 @@ private:
/// global value \p GV, false otherwise.
bool shouldEmitPCReloc(const GlobalValue *GV) const;
+ // Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
+ // three offsets (voffset, soffset and instoffset) into the SDValue[3] array
+ // pointed to by Offsets.
+ void setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG,
+ SDValue *Offsets, unsigned Align = 4) const;
+
public:
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI);
@@ -192,6 +211,7 @@ public:
SmallVectorImpl<Value*> &/*Ops*/,
Type *&/*AccessTy*/) const override;
+ bool isLegalGlobalAddressingMode(const AddrMode &AM) const;
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
unsigned AS,
Instruction *I = nullptr) const override;
@@ -215,7 +235,7 @@ public:
bool isCheapAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
TargetLoweringBase::LegalizeTypeAction
- getPreferredVectorAction(EVT VT) const override;
+ getPreferredVectorAction(MVT VT) const override;
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const override;
@@ -248,11 +268,11 @@ public:
void passSpecialInputs(
CallLoweringInfo &CLI,
+ CCState &CCInfo,
const SIMachineFunctionInfo &Info,
SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
SmallVectorImpl<SDValue> &MemOpChains,
- SDValue Chain,
- SDValue StackPtr) const;
+ SDValue Chain) const;
SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
CallingConv::ID CallConv, bool isVarArg,
@@ -322,7 +342,16 @@ public:
unsigned Depth = 0) const override;
bool isSDNodeSourceOfDivergence(const SDNode *N,
- FunctionLoweringInfo *FLI, DivergenceAnalysis *DA) const override;
+ FunctionLoweringInfo *FLI, LegacyDivergenceAnalysis *DA) const override;
+
+ bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
+ unsigned MaxDepth = 5) const;
+ bool denormalsEnabledForType(EVT VT) const;
+
+ bool isKnownNeverNaNForTargetNode(SDValue Op,
+ const SelectionDAG &DAG,
+ bool SNaN = false,
+ unsigned Depth = 0) const override;
};
} // End namespace llvm
diff --git a/lib/Target/AMDGPU/SIInsertSkips.cpp b/lib/Target/AMDGPU/SIInsertSkips.cpp
index dc9397cf7b85e..ba21a5ce1293a 100644
--- a/lib/Target/AMDGPU/SIInsertSkips.cpp
+++ b/lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -66,6 +66,8 @@ private:
bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);
+ bool optimizeVccBranch(MachineInstr &MI) const;
+
public:
static char ID;
@@ -320,6 +322,96 @@ bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
return true;
}
+bool SIInsertSkips::optimizeVccBranch(MachineInstr &MI) const {
+ // Match:
+ // sreg = -1
+ // vcc = S_AND_B64 exec, sreg
+ // S_CBRANCH_VCC[N]Z
+ // =>
+ // S_CBRANCH_EXEC[N]Z
+ bool Changed = false;
+ MachineBasicBlock &MBB = *MI.getParent();
+ const unsigned CondReg = AMDGPU::VCC;
+ const unsigned ExecReg = AMDGPU::EXEC;
+ const unsigned And = AMDGPU::S_AND_B64;
+
+ MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
+ E = MBB.rend();
+ bool ReadsCond = false;
+ unsigned Threshold = 5;
+ for (++A ; A != E ; ++A) {
+ if (!--Threshold)
+ return false;
+ if (A->modifiesRegister(ExecReg, TRI))
+ return false;
+ if (A->modifiesRegister(CondReg, TRI)) {
+ if (!A->definesRegister(CondReg, TRI) || A->getOpcode() != And)
+ return false;
+ break;
+ }
+ ReadsCond |= A->readsRegister(CondReg, TRI);
+ }
+ if (A == E)
+ return false;
+
+ MachineOperand &Op1 = A->getOperand(1);
+ MachineOperand &Op2 = A->getOperand(2);
+ if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
+ TII->commuteInstruction(*A);
+ Changed = true;
+ }
+ if (Op1.getReg() != ExecReg)
+ return Changed;
+ if (Op2.isImm() && Op2.getImm() != -1)
+ return Changed;
+
+ unsigned SReg = AMDGPU::NoRegister;
+ if (Op2.isReg()) {
+ SReg = Op2.getReg();
+ auto M = std::next(A);
+ bool ReadsSreg = false;
+ for ( ; M != E ; ++M) {
+ if (M->definesRegister(SReg, TRI))
+ break;
+ if (M->modifiesRegister(SReg, TRI))
+ return Changed;
+ ReadsSreg |= M->readsRegister(SReg, TRI);
+ }
+ if (M == E ||
+ !M->isMoveImmediate() ||
+ !M->getOperand(1).isImm() ||
+ M->getOperand(1).getImm() != -1)
+ return Changed;
+ // First, if sreg is only used in the AND instruction, fold the immediate
+ // into that AND.
+ if (!ReadsSreg && Op2.isKill()) {
+ A->getOperand(2).ChangeToImmediate(-1);
+ M->eraseFromParent();
+ }
+ }
+
+ if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC) &&
+ MI.killsRegister(CondReg, TRI))
+ A->eraseFromParent();
+
+ bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
+ if (SReg == ExecReg) {
+ if (IsVCCZ) {
+ MI.eraseFromParent();
+ return true;
+ }
+ MI.setDesc(TII->get(AMDGPU::S_BRANCH));
+ } else {
+ MI.setDesc(TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ
+ : AMDGPU::S_CBRANCH_EXECNZ));
+ }
+
+ MI.RemoveOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI));
+ MI.addImplicitDefUseOperands(*MBB.getParent());
+
+ return true;
+}
+
bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
@@ -384,7 +476,7 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
kill(MI);
if (ExecBranchStack.empty()) {
- if (skipIfDead(MI, *NextBB)) {
+ if (NextBB != BE && skipIfDead(MI, *NextBB)) {
HaveSkipBlock = true;
NextBB = std::next(BI);
BE = MF.end();
@@ -417,6 +509,11 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
}
break;
+ case AMDGPU::S_CBRANCH_VCCZ:
+ case AMDGPU::S_CBRANCH_VCCNZ:
+ MadeChange |= optimizeVccBranch(MI);
+ break;
+
default:
break;
}
diff --git a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index d456e3d9b94d0..afc0b44676109 100644
--- a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -13,6 +13,14 @@
/// Memory reads and writes are issued asynchronously, so we need to insert
/// S_WAITCNT instructions when we want to access any of their results or
/// overwrite any register that's used asynchronously.
+///
+/// TODO: This pass currently keeps one timeline per hardware counter. A more
+/// finely-grained approach that keeps one timeline per event type could
+/// sometimes get away with generating weaker s_waitcnt instructions. For
+/// example, when both SMEM and LDS are in flight and we need to wait for
+/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
+/// but the pass will currently generate a conservative lgkmcnt(0) because
+/// multiple event types are in flight.
//
//===----------------------------------------------------------------------===//
@@ -33,7 +41,6 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -69,6 +76,25 @@ static cl::opt<unsigned> ForceEmitZeroFlag(
namespace {
+template <typename EnumT>
+class enum_iterator
+ : public iterator_facade_base<enum_iterator<EnumT>,
+ std::forward_iterator_tag, const EnumT> {
+ EnumT Value;
+public:
+ enum_iterator() = default;
+ enum_iterator(EnumT Value) : Value(Value) {}
+
+ enum_iterator &operator++() {
+ Value = static_cast<EnumT>(Value + 1);
+ return *this;
+ }
+
+ bool operator==(const enum_iterator &RHS) const { return Value == RHS.Value; }
+
+ EnumT operator*() const { return Value; }
+};
+
// Class of object that encapsulates latest instruction counter score
// associated with the operand. Used for determining whether
// s_waitcnt instruction needs to be emited.
@@ -77,12 +103,17 @@ namespace {
enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS };
+iterator_range<enum_iterator<InstCounterType>> inst_counter_types() {
+ return make_range(enum_iterator<InstCounterType>(VM_CNT),
+ enum_iterator<InstCounterType>(NUM_INST_CNTS));
+}
+
using RegInterval = std::pair<signed, signed>;
struct {
- int32_t VmcntMax;
- int32_t ExpcntMax;
- int32_t LgkmcntMax;
+ uint32_t VmcntMax;
+ uint32_t ExpcntMax;
+ uint32_t LgkmcntMax;
int32_t NumVGPRsMax;
int32_t NumSGPRsMax;
} HardwareLimits;
@@ -108,6 +139,14 @@ enum WaitEventType {
NUM_WAIT_EVENTS,
};
+static const uint32_t WaitEventMaskForInst[NUM_INST_CNTS] = {
+ (1 << VMEM_ACCESS),
+ (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
+ (1 << SQ_MESSAGE),
+ (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
+ (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS),
+};
+
// The mapping is:
// 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
// SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots
@@ -122,30 +161,38 @@ enum RegisterMapping {
NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
};
-#define ForAllWaitEventType(w) \
- for (enum WaitEventType w = (enum WaitEventType)0; \
- (w) < (enum WaitEventType)NUM_WAIT_EVENTS; \
- (w) = (enum WaitEventType)((w) + 1))
+void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
+ switch (T) {
+ case VM_CNT:
+ Wait.VmCnt = std::min(Wait.VmCnt, Count);
+ break;
+ case EXP_CNT:
+ Wait.ExpCnt = std::min(Wait.ExpCnt, Count);
+ break;
+ case LGKM_CNT:
+ Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count);
+ break;
+ default:
+ llvm_unreachable("bad InstCounterType");
+ }
+}
-// This is a per-basic-block object that maintains current score brackets
-// of each wait counter, and a per-register scoreboard for each wait counter.
+// This object maintains the current score brackets of each wait counter, and
+// a per-register scoreboard for each wait counter.
+//
// We also maintain the latest score for every event type that can change the
// waitcnt in order to know if there are multiple types of events within
// the brackets. When multiple types of event happen in the bracket,
// wait count may get decreased out of order, therefore we need to put in
// "s_waitcnt 0" before use.
-class BlockWaitcntBrackets {
+class WaitcntBrackets {
public:
- BlockWaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {
- for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
- T = (enum InstCounterType)(T + 1)) {
+ WaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {
+ for (auto T : inst_counter_types())
memset(VgprScores[T], 0, sizeof(VgprScores[T]));
- }
}
- ~BlockWaitcntBrackets() = default;
-
- static int32_t getWaitCountMax(InstCounterType T) {
+ static uint32_t getWaitCountMax(InstCounterType T) {
switch (T) {
case VM_CNT:
return HardwareLimits.VmcntMax;
@@ -159,33 +206,14 @@ public:
return 0;
}
- void setScoreLB(InstCounterType T, int32_t Val) {
- assert(T < NUM_INST_CNTS);
- if (T >= NUM_INST_CNTS)
- return;
- ScoreLBs[T] = Val;
- }
-
- void setScoreUB(InstCounterType T, int32_t Val) {
- assert(T < NUM_INST_CNTS);
- if (T >= NUM_INST_CNTS)
- return;
- ScoreUBs[T] = Val;
- if (T == EXP_CNT) {
- int32_t UB = (int)(ScoreUBs[T] - getWaitCountMax(EXP_CNT));
- if (ScoreLBs[T] < UB)
- ScoreLBs[T] = UB;
- }
- }
-
- int32_t getScoreLB(InstCounterType T) {
+ uint32_t getScoreLB(InstCounterType T) const {
assert(T < NUM_INST_CNTS);
if (T >= NUM_INST_CNTS)
return 0;
return ScoreLBs[T];
}
- int32_t getScoreUB(InstCounterType T) {
+ uint32_t getScoreUB(InstCounterType T) const {
assert(T < NUM_INST_CNTS);
if (T >= NUM_INST_CNTS)
return 0;
@@ -194,89 +222,56 @@ public:
// Mapping from event to counter.
InstCounterType eventCounter(WaitEventType E) {
- switch (E) {
- case VMEM_ACCESS:
+ if (E == VMEM_ACCESS)
return VM_CNT;
- case LDS_ACCESS:
- case GDS_ACCESS:
- case SQ_MESSAGE:
- case SMEM_ACCESS:
+ if (WaitEventMaskForInst[LGKM_CNT] & (1 << E))
return LGKM_CNT;
- case EXP_GPR_LOCK:
- case GDS_GPR_LOCK:
- case VMW_GPR_LOCK:
- case EXP_POS_ACCESS:
- case EXP_PARAM_ACCESS:
- return EXP_CNT;
- default:
- llvm_unreachable("unhandled event type");
- }
- return NUM_INST_CNTS;
- }
-
- void setRegScore(int GprNo, InstCounterType T, int32_t Val) {
- if (GprNo < NUM_ALL_VGPRS) {
- if (GprNo > VgprUB) {
- VgprUB = GprNo;
- }
- VgprScores[T][GprNo] = Val;
- } else {
- assert(T == LGKM_CNT);
- if (GprNo - NUM_ALL_VGPRS > SgprUB) {
- SgprUB = GprNo - NUM_ALL_VGPRS;
- }
- SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
- }
+ assert(WaitEventMaskForInst[EXP_CNT] & (1 << E));
+ return EXP_CNT;
}
- int32_t getRegScore(int GprNo, InstCounterType T) {
+ uint32_t getRegScore(int GprNo, InstCounterType T) {
if (GprNo < NUM_ALL_VGPRS) {
return VgprScores[T][GprNo];
}
+ assert(T == LGKM_CNT);
return SgprScores[GprNo - NUM_ALL_VGPRS];
}
void clear() {
memset(ScoreLBs, 0, sizeof(ScoreLBs));
memset(ScoreUBs, 0, sizeof(ScoreUBs));
- memset(EventUBs, 0, sizeof(EventUBs));
- for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
- T = (enum InstCounterType)(T + 1)) {
+ PendingEvents = 0;
+ memset(MixedPendingEvents, 0, sizeof(MixedPendingEvents));
+ for (auto T : inst_counter_types())
memset(VgprScores[T], 0, sizeof(VgprScores[T]));
- }
memset(SgprScores, 0, sizeof(SgprScores));
}
+ bool merge(const WaitcntBrackets &Other);
+
RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
const MachineRegisterInfo *MRI,
const SIRegisterInfo *TRI, unsigned OpNo,
bool Def) const;
- void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
- const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
- unsigned OpNo, int32_t Val);
-
- void setWaitAtBeginning() { WaitAtBeginning = true; }
- void clearWaitAtBeginning() { WaitAtBeginning = false; }
- bool getWaitAtBeginning() const { return WaitAtBeginning; }
- void setEventUB(enum WaitEventType W, int32_t Val) { EventUBs[W] = Val; }
int32_t getMaxVGPR() const { return VgprUB; }
int32_t getMaxSGPR() const { return SgprUB; }
- int32_t getEventUB(enum WaitEventType W) const {
- assert(W < NUM_WAIT_EVENTS);
- return EventUBs[W];
- }
-
- bool counterOutOfOrder(InstCounterType T);
- unsigned int updateByWait(InstCounterType T, int ScoreToWait);
+ bool counterOutOfOrder(InstCounterType T) const;
+ bool simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
+ bool simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
+ void determineWait(InstCounterType T, uint32_t ScoreToWait,
+ AMDGPU::Waitcnt &Wait) const;
+ void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
+ void applyWaitcnt(InstCounterType T, unsigned Count);
void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
const MachineRegisterInfo *MRI, WaitEventType E,
MachineInstr &MI);
- bool hasPendingSMEM() const {
- return (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
- EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]);
+ bool hasPending() const { return PendingEvents != 0; }
+ bool hasPendingEvent(WaitEventType E) const {
+ return PendingEvents & (1 << E);
}
bool hasPendingFlat() const {
@@ -291,75 +286,71 @@ public:
LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
}
- int pendingFlat(InstCounterType Ct) const { return LastFlat[Ct]; }
-
- void setLastFlat(InstCounterType Ct, int Val) { LastFlat[Ct] = Val; }
+ void print(raw_ostream &);
+ void dump() { print(dbgs()); }
- bool getRevisitLoop() const { return RevisitLoop; }
- void setRevisitLoop(bool RevisitLoopIn) { RevisitLoop = RevisitLoopIn; }
+private:
+ struct MergeInfo {
+ uint32_t OldLB;
+ uint32_t OtherLB;
+ uint32_t MyShift;
+ uint32_t OtherShift;
+ };
+ static bool mergeScore(const MergeInfo &M, uint32_t &Score,
+ uint32_t OtherScore);
- void setPostOrder(int32_t PostOrderIn) { PostOrder = PostOrderIn; }
- int32_t getPostOrder() const { return PostOrder; }
+ void setScoreLB(InstCounterType T, uint32_t Val) {
+ assert(T < NUM_INST_CNTS);
+ if (T >= NUM_INST_CNTS)
+ return;
+ ScoreLBs[T] = Val;
+ }
- void setWaitcnt(MachineInstr *WaitcntIn) { Waitcnt = WaitcntIn; }
- void clearWaitcnt() { Waitcnt = nullptr; }
- MachineInstr *getWaitcnt() const { return Waitcnt; }
+ void setScoreUB(InstCounterType T, uint32_t Val) {
+ assert(T < NUM_INST_CNTS);
+ if (T >= NUM_INST_CNTS)
+ return;
+ ScoreUBs[T] = Val;
+ if (T == EXP_CNT) {
+ uint32_t UB = ScoreUBs[T] - getWaitCountMax(EXP_CNT);
+ if (ScoreLBs[T] < UB && UB < ScoreUBs[T])
+ ScoreLBs[T] = UB;
+ }
+ }
- bool mixedExpTypes() const { return MixedExpTypes; }
- void setMixedExpTypes(bool MixedExpTypesIn) {
- MixedExpTypes = MixedExpTypesIn;
+ void setRegScore(int GprNo, InstCounterType T, uint32_t Val) {
+ if (GprNo < NUM_ALL_VGPRS) {
+ if (GprNo > VgprUB) {
+ VgprUB = GprNo;
+ }
+ VgprScores[T][GprNo] = Val;
+ } else {
+ assert(T == LGKM_CNT);
+ if (GprNo - NUM_ALL_VGPRS > SgprUB) {
+ SgprUB = GprNo - NUM_ALL_VGPRS;
+ }
+ SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
+ }
}
- void print(raw_ostream &);
- void dump() { print(dbgs()); }
+ void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
+ const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
+ unsigned OpNo, uint32_t Val);
-private:
const GCNSubtarget *ST = nullptr;
- bool WaitAtBeginning = false;
- bool RevisitLoop = false;
- bool MixedExpTypes = false;
- int32_t PostOrder = 0;
- MachineInstr *Waitcnt = nullptr;
- int32_t ScoreLBs[NUM_INST_CNTS] = {0};
- int32_t ScoreUBs[NUM_INST_CNTS] = {0};
- int32_t EventUBs[NUM_WAIT_EVENTS] = {0};
+ uint32_t ScoreLBs[NUM_INST_CNTS] = {0};
+ uint32_t ScoreUBs[NUM_INST_CNTS] = {0};
+ uint32_t PendingEvents = 0;
+ bool MixedPendingEvents[NUM_INST_CNTS] = {false};
// Remember the last flat memory operation.
- int32_t LastFlat[NUM_INST_CNTS] = {0};
+ uint32_t LastFlat[NUM_INST_CNTS] = {0};
// wait_cnt scores for every vgpr.
// Keep track of the VgprUB and SgprUB to make merge at join efficient.
int32_t VgprUB = 0;
int32_t SgprUB = 0;
- int32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
+ uint32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
// Wait cnt scores for every sgpr, only lgkmcnt is relevant.
- int32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
-};
-
-// This is a per-loop-region object that records waitcnt status at the end of
-// loop footer from the previous iteration. We also maintain an iteration
-// count to track the number of times the loop has been visited. When it
-// doesn't converge naturally, we force convergence by inserting s_waitcnt 0
-// at the end of the loop footer.
-class LoopWaitcntData {
-public:
- LoopWaitcntData() = default;
- ~LoopWaitcntData() = default;
-
- void incIterCnt() { IterCnt++; }
- void resetIterCnt() { IterCnt = 0; }
- unsigned getIterCnt() { return IterCnt; }
-
- void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }
- MachineInstr *getWaitcnt() const { return LfWaitcnt; }
-
- void print() { LLVM_DEBUG(dbgs() << " iteration " << IterCnt << '\n';); }
-
-private:
- // s_waitcnt added at the end of loop footer to stablize wait scores
- // at the end of the loop footer.
- MachineInstr *LfWaitcnt = nullptr;
- // Number of iterations the loop has been visited, not including the initial
- // walk over.
- int32_t IterCnt = 0;
+ uint32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
};
class SIInsertWaitcnts : public MachineFunctionPass {
@@ -368,22 +359,21 @@ private:
const SIInstrInfo *TII = nullptr;
const SIRegisterInfo *TRI = nullptr;
const MachineRegisterInfo *MRI = nullptr;
- const MachineLoopInfo *MLI = nullptr;
- AMDGPU::IsaInfo::IsaVersion IV;
- AMDGPUAS AMDGPUASI;
+ AMDGPU::IsaVersion IV;
- DenseSet<MachineBasicBlock *> BlockVisitedSet;
DenseSet<MachineInstr *> TrackedWaitcntSet;
DenseSet<MachineInstr *> VCCZBugHandledSet;
- DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>>
- BlockWaitcntBracketsMap;
+ // Per-basic-block record kept by the pass: the block itself, an owned
+ // WaitcntBrackets named Incoming (presumably the bracket state on entry to
+ // MBB -- TODO confirm against the code that fills it in), and a Dirty flag
+ // that starts out true (presumably "still needs processing" -- TODO confirm).
+ struct BlockInfo {
+ MachineBasicBlock *MBB;
+ std::unique_ptr<WaitcntBrackets> Incoming;
+ bool Dirty = true;
- std::vector<MachineBasicBlock *> BlockWaitcntProcessedSet;
+ explicit BlockInfo(MachineBasicBlock *MBB) : MBB(MBB) {}
+ };
- DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap;
-
- std::vector<std::unique_ptr<BlockWaitcntBrackets>> KillWaitBrackets;
+ std::vector<BlockInfo> BlockInfos; // by reverse post-order traversal index
+ DenseMap<MachineBasicBlock *, unsigned> RpotIdxMap;
// ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0
// because of amdgpu-waitcnt-forcezero flag
@@ -407,20 +397,11 @@ public:
+ // Reports that this pass preserves the CFG, then lets MachineFunctionPass
+ // add its own analysis usage.
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
- AU.addRequired<MachineLoopInfo>();
MachineFunctionPass::getAnalysisUsage(AU);
}
- void addKillWaitBracket(BlockWaitcntBrackets *Bracket) {
- // The waitcnt information is copied because it changes as the block is
- // traversed.
- KillWaitBrackets.push_back(
- llvm::make_unique<BlockWaitcntBrackets>(*Bracket));
- }
-
+ // Returns true if ForceEmitWaitcnt is set for at least one counter type.
bool isForceEmitWaitcnt() const {
- for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
- T = (enum InstCounterType)(T + 1))
+ for (auto T : inst_counter_types())
if (ForceEmitWaitcnt[T])
return true;
return false;
}
@@ -454,27 +435,22 @@ public:
}
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
- void generateWaitcntInstBefore(MachineInstr &MI,
- BlockWaitcntBrackets *ScoreBrackets);
+ bool generateWaitcntInstBefore(MachineInstr &MI,
+ WaitcntBrackets &ScoreBrackets,
+ MachineInstr *OldWaitcntInstr);
void updateEventWaitcntAfter(MachineInstr &Inst,
- BlockWaitcntBrackets *ScoreBrackets);
- void mergeInputScoreBrackets(MachineBasicBlock &Block);
- bool isLoopBottom(const MachineLoop *Loop, const MachineBasicBlock *Block);
- unsigned countNumBottomBlocks(const MachineLoop *Loop);
- void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block);
- void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst);
- bool isWaitcntStronger(unsigned LHS, unsigned RHS);
- unsigned combineWaitcnt(unsigned LHS, unsigned RHS);
+ WaitcntBrackets *ScoreBrackets);
+ bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
+ WaitcntBrackets &ScoreBrackets);
};
} // end anonymous namespace
-RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI,
- const SIInstrInfo *TII,
- const MachineRegisterInfo *MRI,
- const SIRegisterInfo *TRI,
- unsigned OpNo,
- bool Def) const {
+RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
+ const SIInstrInfo *TII,
+ const MachineRegisterInfo *MRI,
+ const SIRegisterInfo *TRI,
+ unsigned OpNo, bool Def) const {
const MachineOperand &Op = MI->getOperand(OpNo);
if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
(Def && !Op.isDef()))
@@ -512,11 +488,11 @@ RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI,
return Result;
}
-void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI,
- const SIInstrInfo *TII,
- const SIRegisterInfo *TRI,
- const MachineRegisterInfo *MRI,
- unsigned OpNo, int32_t Val) {
+void WaitcntBrackets::setExpScore(const MachineInstr *MI,
+ const SIInstrInfo *TII,
+ const SIRegisterInfo *TRI,
+ const MachineRegisterInfo *MRI, unsigned OpNo,
+ uint32_t Val) {
RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
LLVM_DEBUG({
const MachineOperand &Opnd = MI->getOperand(OpNo);
@@ -527,26 +503,26 @@ void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI,
}
}
-void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
- const SIRegisterInfo *TRI,
- const MachineRegisterInfo *MRI,
- WaitEventType E, MachineInstr &Inst) {
+void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
+ const SIRegisterInfo *TRI,
+ const MachineRegisterInfo *MRI,
+ WaitEventType E, MachineInstr &Inst) {
const MachineRegisterInfo &MRIA = *MRI;
InstCounterType T = eventCounter(E);
- int32_t CurrScore = getScoreUB(T) + 1;
- // EventUB and ScoreUB need to be update regardless if this event changes
- // the score of a register or not.
+ uint32_t CurrScore = getScoreUB(T) + 1;
+ if (CurrScore == 0)
+ report_fatal_error("InsertWaitcnt score wraparound");
+ // PendingEvents and ScoreUB need to be updated regardless of whether this
+ // event changes the score of a register.
// Examples including vm_cnt when buffer-store or lgkm_cnt when send-message.
- EventUBs[E] = CurrScore;
+ if (!hasPendingEvent(E)) {
+ if (PendingEvents & WaitEventMaskForInst[T])
+ MixedPendingEvents[T] = true;
+ PendingEvents |= 1 << E;
+ }
setScoreUB(T, CurrScore);
if (T == EXP_CNT) {
- // Check for mixed export types. If they are mixed, then a waitcnt exp(0)
- // is required.
- if (!MixedExpTypes) {
- MixedExpTypes = counterOutOfOrder(EXP_CNT);
- }
-
// Put score on the source vgprs. If this is a store, just use those
// specific register(s).
if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
@@ -671,12 +647,11 @@ void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
}
}
-void BlockWaitcntBrackets::print(raw_ostream &OS) {
+void WaitcntBrackets::print(raw_ostream &OS) {
OS << '\n';
- for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
- T = (enum InstCounterType)(T + 1)) {
- int LB = getScoreLB(T);
- int UB = getScoreUB(T);
+ for (auto T : inst_counter_types()) {
+ uint32_t LB = getScoreLB(T);
+ uint32_t UB = getScoreUB(T);
switch (T) {
case VM_CNT:
@@ -696,10 +671,10 @@ void BlockWaitcntBrackets::print(raw_ostream &OS) {
if (LB < UB) {
// Print vgpr scores.
for (int J = 0; J <= getMaxVGPR(); J++) {
- int RegScore = getRegScore(J, T);
+ uint32_t RegScore = getRegScore(J, T);
if (RegScore <= LB)
continue;
- int RelScore = RegScore - LB - 1;
+ uint32_t RelScore = RegScore - LB - 1;
if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
OS << RelScore << ":v" << J << " ";
} else {
@@ -709,10 +684,10 @@ void BlockWaitcntBrackets::print(raw_ostream &OS) {
// Also need to print sgpr scores for lgkm_cnt.
if (T == LGKM_CNT) {
for (int J = 0; J <= getMaxSGPR(); J++) {
- int RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
+ uint32_t RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
if (RegScore <= LB)
continue;
- int RelScore = RegScore - LB - 1;
+ uint32_t RelScore = RegScore - LB - 1;
OS << RelScore << ":s" << J << " ";
}
}
@@ -722,23 +697,31 @@ void BlockWaitcntBrackets::print(raw_ostream &OS) {
OS << '\n';
}
-unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T,
- int ScoreToWait) {
- unsigned int NeedWait = 0;
- if (ScoreToWait == -1) {
- // The score to wait is unknown. This implies that it was not encountered
- // during the path of the CFG walk done during the current traversal but
- // may be seen on a different path. Emit an s_wait counter with a
- // conservative value of 0 for the counter.
- NeedWait = CNT_MASK(T);
- setScoreLB(T, getScoreUB(T));
- return NeedWait;
- }
+/// Strip from Wait every count that the current score brackets make
+/// redundant. Returns true if at least one count is still required, i.e. a
+/// waitcnt instruction is still needed.
+bool WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
+  // All three counters are processed unconditionally (no short-circuiting),
+  // because each per-counter call may rewrite its count in place.
+  const bool NeedVm = simplifyWaitcnt(VM_CNT, Wait.VmCnt);
+  const bool NeedExp = simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
+  const bool NeedLgkm = simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
+  return NeedVm || NeedExp || NeedLgkm;
+}
+
+/// Per-counter helper: returns true if waiting until counter T is at most
+/// Count is still required given the current bracket of T. Otherwise Count
+/// is overwritten with ~0u and false is returned.
+bool WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
+                                      unsigned &Count) const {
+  const uint32_t Lower = getScoreLB(T);
+  const uint32_t Upper = getScoreUB(T);
+  // The wait matters only if the score it reaches (Upper - Count) lies above
+  // the lower bound; checking Count < Upper first keeps the unsigned
+  // subtraction from wrapping.
+  const bool Needed = Count < Upper && Upper - Count > Lower;
+  if (!Needed)
+    Count = ~0u;
+  return Needed;
+}
+
+void WaitcntBrackets::determineWait(InstCounterType T, uint32_t ScoreToWait,
+ AMDGPU::Waitcnt &Wait) const {
// If the score of src_operand falls within the bracket, we need an
// s_waitcnt instruction.
- const int32_t LB = getScoreLB(T);
- const int32_t UB = getScoreUB(T);
+ const uint32_t LB = getScoreLB(T);
+ const uint32_t UB = getScoreUB(T);
if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
if ((T == VM_CNT || T == LGKM_CNT) &&
hasPendingFlat() &&
@@ -746,90 +729,46 @@ unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T,
// If there is a pending FLAT operation, and this is a VMem or LGKM
// waitcnt and the target can report early completion, then we need
// to force a waitcnt 0.
- NeedWait = CNT_MASK(T);
- setScoreLB(T, getScoreUB(T));
+ addWait(Wait, T, 0);
} else if (counterOutOfOrder(T)) {
// Counter can get decremented out-of-order when there
// are multiple types of event in the bracket. Also emit an s_wait counter
// with a conservative value of 0 for the counter.
- NeedWait = CNT_MASK(T);
- setScoreLB(T, getScoreUB(T));
+ addWait(Wait, T, 0);
} else {
- NeedWait = CNT_MASK(T);
- setScoreLB(T, ScoreToWait);
+ addWait(Wait, T, UB - ScoreToWait);
}
}
+}
- return NeedWait;
+// Applies each of the three counts carried by Wait (VmCnt, ExpCnt, LgkmCnt)
+// to its counter through the per-counter applyWaitcnt overload.
+void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
+ applyWaitcnt(VM_CNT, Wait.VmCnt);
+ applyWaitcnt(EXP_CNT, Wait.ExpCnt);
+ applyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
}
-// Where there are multiple types of event in the bracket of a counter,
-// the decrement may go out of order.
-bool BlockWaitcntBrackets::counterOutOfOrder(InstCounterType T) {
- switch (T) {
- case VM_CNT:
- return false;
- case LGKM_CNT: {
- if (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
- EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]) {
- // Scalar memory read always can go out of order.
- return true;
- }
- int NumEventTypes = 0;
- if (EventUBs[LDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
- EventUBs[LDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
- NumEventTypes++;
- }
- if (EventUBs[GDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
- EventUBs[GDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
- NumEventTypes++;
- }
- if (EventUBs[SQ_MESSAGE] > ScoreLBs[LGKM_CNT] &&
- EventUBs[SQ_MESSAGE] <= ScoreUBs[LGKM_CNT]) {
- NumEventTypes++;
- }
- if (NumEventTypes <= 1) {
- return false;
- }
- break;
+// Update the bracket of counter T to reflect a wait until at most Count
+// operations remain outstanding. A count at or above the upper bound is
+// ignored. A count of zero closes the bracket and clears the pending-event
+// state of T. Any other count raises the lower bound, but only when the
+// counter cannot decrement out of order.
+void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
+  const uint32_t Upper = getScoreUB(T);
+  if (Count >= Upper)
+    return;
+  if (Count == 0) {
+    setScoreLB(T, Upper);
+    MixedPendingEvents[T] = false;
+    PendingEvents &= ~WaitEventMaskForInst[T];
+  } else if (!counterOutOfOrder(T)) {
+    setScoreLB(T, std::max(getScoreLB(T), Upper - Count));
}
- case EXP_CNT: {
- // If there has been a mixture of export types, then a waitcnt exp(0) is
- // required.
- if (MixedExpTypes)
- return true;
- int NumEventTypes = 0;
- if (EventUBs[EXP_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
- EventUBs[EXP_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
- NumEventTypes++;
- }
- if (EventUBs[GDS_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
- EventUBs[GDS_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
- NumEventTypes++;
- }
- if (EventUBs[VMW_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
- EventUBs[VMW_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
- NumEventTypes++;
- }
- if (EventUBs[EXP_PARAM_ACCESS] > ScoreLBs[EXP_CNT] &&
- EventUBs[EXP_PARAM_ACCESS] <= ScoreUBs[EXP_CNT]) {
- NumEventTypes++;
- }
-
- if (EventUBs[EXP_POS_ACCESS] > ScoreLBs[EXP_CNT] &&
- EventUBs[EXP_POS_ACCESS] <= ScoreUBs[EXP_CNT]) {
- NumEventTypes++;
- }
+}
- if (NumEventTypes <= 1) {
- return false;
- }
- break;
- }
- default:
- break;
- }
- return true;
+// Reports whether counter T may currently decrement out of order: either
+// MixedPendingEvents[T] is set, or T is LGKM_CNT and an SMEM_ACCESS event is
+// pending.
+bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
+  // Scalar memory read always can go out of order.
+  const bool PendingScalarRead =
+      T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS);
+  return PendingScalarRead || MixedPendingEvents[T];
}
INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
@@ -851,29 +790,6 @@ static bool readsVCCZ(const MachineInstr &MI) {
!MI.getOperand(1).isUndef();
}
-/// Given wait count encodings checks if LHS is stronger than RHS.
-bool SIInsertWaitcnts::isWaitcntStronger(unsigned LHS, unsigned RHS) {
- if (AMDGPU::decodeVmcnt(IV, LHS) > AMDGPU::decodeVmcnt(IV, RHS))
- return false;
- if (AMDGPU::decodeLgkmcnt(IV, LHS) > AMDGPU::decodeLgkmcnt(IV, RHS))
- return false;
- if (AMDGPU::decodeExpcnt(IV, LHS) > AMDGPU::decodeExpcnt(IV, RHS))
- return false;
- return true;
-}
-
-/// Given wait count encodings create a new encoding which is stronger
-/// or equal to both.
-unsigned SIInsertWaitcnts::combineWaitcnt(unsigned LHS, unsigned RHS) {
- unsigned VmCnt = std::min(AMDGPU::decodeVmcnt(IV, LHS),
- AMDGPU::decodeVmcnt(IV, RHS));
- unsigned LgkmCnt = std::min(AMDGPU::decodeLgkmcnt(IV, LHS),
- AMDGPU::decodeLgkmcnt(IV, RHS));
- unsigned ExpCnt = std::min(AMDGPU::decodeExpcnt(IV, LHS),
- AMDGPU::decodeExpcnt(IV, RHS));
- return AMDGPU::encodeWaitcnt(IV, VmCnt, ExpCnt, LgkmCnt);
-}
-
/// Generate s_waitcnt instruction to be placed before cur_Inst.
/// Instructions of a given type are returned in order,
/// but instructions of different types can complete out of order.
@@ -884,51 +800,23 @@ unsigned SIInsertWaitcnts::combineWaitcnt(unsigned LHS, unsigned RHS) {
/// and if so what the value of each counter is.
/// The "score bracket" is bound by the lower bound and upper bound
/// scores (*_score_LB and *_score_ub respectively).
-void SIInsertWaitcnts::generateWaitcntInstBefore(
- MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets) {
- // To emit, or not to emit - that's the question!
- // Start with an assumption that there is no need to emit.
- unsigned int EmitWaitcnt = 0;
-
- // No need to wait before phi. If a phi-move exists, then the wait should
- // has been inserted before the move. If a phi-move does not exist, then
- // wait should be inserted before the real use. The same is true for
- // sc-merge. It is not a coincident that all these cases correspond to the
- // instructions that are skipped in the assembling loop.
- bool NeedLineMapping = false; // TODO: Check on this.
-
- // ForceEmitZeroWaitcnt: force a single s_waitcnt 0 due to hw bug
- bool ForceEmitZeroWaitcnt = false;
-
+bool SIInsertWaitcnts::generateWaitcntInstBefore(
+ MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
+ MachineInstr *OldWaitcntInstr) {
setForceEmitWaitcnt();
bool IsForceEmitWaitcnt = isForceEmitWaitcnt();
- if (MI.isDebugInstr() &&
- // TODO: any other opcode?
- !NeedLineMapping) {
- return;
- }
+ if (MI.isDebugInstr())
+ return false;
- // See if an s_waitcnt is forced at block entry, or is needed at
- // program end.
- if (ScoreBrackets->getWaitAtBeginning()) {
- // Note that we have already cleared the state, so we don't need to update
- // it.
- ScoreBrackets->clearWaitAtBeginning();
- for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
- T = (enum InstCounterType)(T + 1)) {
- EmitWaitcnt |= CNT_MASK(T);
- ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
- }
- }
+ AMDGPU::Waitcnt Wait;
// See if this instruction has a forced S_WAITCNT VM.
// TODO: Handle other cases of NeedsWaitcntVmBefore()
- else if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
- MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
- MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
- EmitWaitcnt |=
- ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
+ if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
+ MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
+ MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
+ Wait.VmCnt = 0;
}
// All waits must be resolved at call return.
@@ -936,23 +824,14 @@ void SIInsertWaitcnts::generateWaitcntInstBefore(
// with knowledge of the called routines.
if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
- for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
- T = (enum InstCounterType)(T + 1)) {
- if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
- ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
- EmitWaitcnt |= CNT_MASK(T);
- }
- }
+ Wait = AMDGPU::Waitcnt::allZero();
}
// Resolve vm waits before gs-done.
else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
AMDGPU::SendMsg::ID_GS_DONE)) {
- if (ScoreBrackets->getScoreUB(VM_CNT) > ScoreBrackets->getScoreLB(VM_CNT)) {
- ScoreBrackets->setScoreLB(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
- EmitWaitcnt |= CNT_MASK(VM_CNT);
- }
+ Wait.VmCnt = 0;
}
#if 0 // TODO: the following blocks of logic when we have fence.
else if (MI.getOpcode() == SC_FENCE) {
@@ -1016,14 +895,12 @@ void SIInsertWaitcnts::generateWaitcntInstBefore(
if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
// Export and GDS are tracked individually, either may trigger a waitcnt
// for EXEC.
- EmitWaitcnt |= ScoreBrackets->updateByWait(
- EXP_CNT, ScoreBrackets->getEventUB(EXP_GPR_LOCK));
- EmitWaitcnt |= ScoreBrackets->updateByWait(
- EXP_CNT, ScoreBrackets->getEventUB(EXP_PARAM_ACCESS));
- EmitWaitcnt |= ScoreBrackets->updateByWait(
- EXP_CNT, ScoreBrackets->getEventUB(EXP_POS_ACCESS));
- EmitWaitcnt |= ScoreBrackets->updateByWait(
- EXP_CNT, ScoreBrackets->getEventUB(GDS_GPR_LOCK));
+ if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
+ ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
+ ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
+ ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
+ Wait.ExpCnt = 0;
+ }
}
#if 0 // TODO: the following code to handle CALL.
@@ -1051,27 +928,27 @@ void SIInsertWaitcnts::generateWaitcntInstBefore(
// instruction.
for (const MachineMemOperand *Memop : MI.memoperands()) {
unsigned AS = Memop->getAddrSpace();
- if (AS != AMDGPUASI.LOCAL_ADDRESS)
+ if (AS != AMDGPUAS::LOCAL_ADDRESS)
continue;
unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
// VM_CNT is only relevant to vgpr or LDS.
- EmitWaitcnt |= ScoreBrackets->updateByWait(
- VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
+ ScoreBrackets.determineWait(
+ VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
}
for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
const MachineOperand &Op = MI.getOperand(I);
const MachineRegisterInfo &MRIA = *MRI;
RegInterval Interval =
- ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, false);
+ ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, false);
for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
if (TRI->isVGPR(MRIA, Op.getReg())) {
// VM_CNT is only relevant to vgpr or LDS.
- EmitWaitcnt |= ScoreBrackets->updateByWait(
- VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
+ ScoreBrackets.determineWait(
+ VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
}
- EmitWaitcnt |= ScoreBrackets->updateByWait(
- LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
+ ScoreBrackets.determineWait(
+ LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
}
}
// End of for loop that looks at all source operands to decide vm_wait_cnt
@@ -1086,29 +963,29 @@ void SIInsertWaitcnts::generateWaitcntInstBefore(
// FIXME: Should not be relying on memoperands.
for (const MachineMemOperand *Memop : MI.memoperands()) {
unsigned AS = Memop->getAddrSpace();
- if (AS != AMDGPUASI.LOCAL_ADDRESS)
+ if (AS != AMDGPUAS::LOCAL_ADDRESS)
continue;
unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
- EmitWaitcnt |= ScoreBrackets->updateByWait(
- VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
- EmitWaitcnt |= ScoreBrackets->updateByWait(
- EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
+ ScoreBrackets.determineWait(
+ VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
+ ScoreBrackets.determineWait(
+ EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
}
}
for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
MachineOperand &Def = MI.getOperand(I);
const MachineRegisterInfo &MRIA = *MRI;
RegInterval Interval =
- ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, true);
+ ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, true);
for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
if (TRI->isVGPR(MRIA, Def.getReg())) {
- EmitWaitcnt |= ScoreBrackets->updateByWait(
- VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
- EmitWaitcnt |= ScoreBrackets->updateByWait(
- EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
+ ScoreBrackets.determineWait(
+ VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
+ ScoreBrackets.determineWait(
+ EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
}
- EmitWaitcnt |= ScoreBrackets->updateByWait(
- LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
+ ScoreBrackets.determineWait(
+ LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
}
} // End of for loop that looks at all dest operands.
}
@@ -1119,182 +996,79 @@ void SIInsertWaitcnts::generateWaitcntInstBefore(
// requiring a WAITCNT beforehand.
if (MI.getOpcode() == AMDGPU::S_BARRIER &&
!ST->hasAutoWaitcntBeforeBarrier()) {
- EmitWaitcnt |=
- ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
- EmitWaitcnt |= ScoreBrackets->updateByWait(
- EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
- EmitWaitcnt |= ScoreBrackets->updateByWait(
- LGKM_CNT, ScoreBrackets->getScoreUB(LGKM_CNT));
+ Wait = AMDGPU::Waitcnt::allZero();
}
// TODO: Remove this work-around, enable the assert for Bug 457939
// after fixing the scheduler. Also, the Shader Compiler code is
// independent of target.
if (readsVCCZ(MI) && ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) {
- if (ScoreBrackets->getScoreLB(LGKM_CNT) <
- ScoreBrackets->getScoreUB(LGKM_CNT) &&
- ScoreBrackets->hasPendingSMEM()) {
- // Wait on everything, not just LGKM. vccz reads usually come from
- // terminators, and we always wait on everything at the end of the
- // block, so if we only wait on LGKM here, we might end up with
- // another s_waitcnt inserted right after this if there are non-LGKM
- // instructions still outstanding.
- // FIXME: this is too conservative / the comment is wrong.
- // We don't wait on everything at the end of the block and we combine
- // waitcnts so we should never have back-to-back waitcnts.
- ForceEmitZeroWaitcnt = true;
- EmitWaitcnt = true;
+ if (ScoreBrackets.getScoreLB(LGKM_CNT) <
+ ScoreBrackets.getScoreUB(LGKM_CNT) &&
+ ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
+ Wait.LgkmCnt = 0;
}
}
- // Does this operand processing indicate s_wait counter update?
- if (EmitWaitcnt || IsForceEmitWaitcnt) {
- int CntVal[NUM_INST_CNTS];
-
- bool UseDefaultWaitcntStrategy = true;
- if (ForceEmitZeroWaitcnt || ForceEmitZeroWaitcnts) {
- // Force all waitcnts to 0.
- for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
- T = (enum InstCounterType)(T + 1)) {
- ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
- }
- CntVal[VM_CNT] = 0;
- CntVal[EXP_CNT] = 0;
- CntVal[LGKM_CNT] = 0;
- UseDefaultWaitcntStrategy = false;
- }
-
- if (UseDefaultWaitcntStrategy) {
- for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
- T = (enum InstCounterType)(T + 1)) {
- if (EmitWaitcnt & CNT_MASK(T)) {
- int Delta =
- ScoreBrackets->getScoreUB(T) - ScoreBrackets->getScoreLB(T);
- int MaxDelta = ScoreBrackets->getWaitCountMax(T);
- if (Delta >= MaxDelta) {
- Delta = -1;
- if (T != EXP_CNT) {
- ScoreBrackets->setScoreLB(
- T, ScoreBrackets->getScoreUB(T) - MaxDelta);
- }
- EmitWaitcnt &= ~CNT_MASK(T);
- }
- CntVal[T] = Delta;
- } else {
- // If we are not waiting for a particular counter then encode
- // it as -1 which means "don't care."
- CntVal[T] = -1;
- }
+ // Early-out if no wait is indicated.
+ if (!ScoreBrackets.simplifyWaitcnt(Wait) && !IsForceEmitWaitcnt) {
+ bool Modified = false;
+ if (OldWaitcntInstr) {
+ if (TrackedWaitcntSet.count(OldWaitcntInstr)) {
+ TrackedWaitcntSet.erase(OldWaitcntInstr);
+ OldWaitcntInstr->eraseFromParent();
+ Modified = true;
+ } else {
+ int64_t Imm = OldWaitcntInstr->getOperand(0).getImm();
+ ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
}
+ Modified = true;
}
+ return Modified;
+ }
- // If we are not waiting on any counter we can skip the wait altogether.
- if (EmitWaitcnt != 0 || IsForceEmitWaitcnt) {
- MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt();
- int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm();
- if (!OldWaitcnt ||
- (AMDGPU::decodeVmcnt(IV, Imm) !=
- (CntVal[VM_CNT] & AMDGPU::getVmcntBitMask(IV))) ||
- (AMDGPU::decodeExpcnt(IV, Imm) !=
- (CntVal[EXP_CNT] & AMDGPU::getExpcntBitMask(IV))) ||
- (AMDGPU::decodeLgkmcnt(IV, Imm) !=
- (CntVal[LGKM_CNT] & AMDGPU::getLgkmcntBitMask(IV)))) {
- MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent());
- if (ContainingLoop) {
- MachineBasicBlock *TBB = ContainingLoop->getHeader();
- BlockWaitcntBrackets *ScoreBracket =
- BlockWaitcntBracketsMap[TBB].get();
- if (!ScoreBracket) {
- assert(!BlockVisitedSet.count(TBB));
- BlockWaitcntBracketsMap[TBB] =
- llvm::make_unique<BlockWaitcntBrackets>(ST);
- ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
- }
- ScoreBracket->setRevisitLoop(true);
- LLVM_DEBUG(dbgs()
- << "set-revisit2: Block"
- << ContainingLoop->getHeader()->getNumber() << '\n';);
- }
- }
+ if (ForceEmitZeroWaitcnts)
+ Wait = AMDGPU::Waitcnt::allZero();
- // Update an existing waitcount, or make a new one.
- unsigned Enc = AMDGPU::encodeWaitcnt(IV,
- ForceEmitWaitcnt[VM_CNT] ? 0 : CntVal[VM_CNT],
- ForceEmitWaitcnt[EXP_CNT] ? 0 : CntVal[EXP_CNT],
- ForceEmitWaitcnt[LGKM_CNT] ? 0 : CntVal[LGKM_CNT]);
- // We don't remove waitcnts that existed prior to the waitcnt
- // pass. Check if the waitcnt to-be-inserted can be avoided
- // or if the prev waitcnt can be updated.
- bool insertSWaitInst = true;
- for (MachineBasicBlock::iterator I = MI.getIterator(),
- B = MI.getParent()->begin();
- insertSWaitInst && I != B; --I) {
- if (I == MI.getIterator())
- continue;
+ if (ForceEmitWaitcnt[VM_CNT])
+ Wait.VmCnt = 0;
+ if (ForceEmitWaitcnt[EXP_CNT])
+ Wait.ExpCnt = 0;
+ if (ForceEmitWaitcnt[LGKM_CNT])
+ Wait.LgkmCnt = 0;
- switch (I->getOpcode()) {
- case AMDGPU::S_WAITCNT:
- if (isWaitcntStronger(I->getOperand(0).getImm(), Enc))
- insertSWaitInst = false;
- else if (!OldWaitcnt) {
- OldWaitcnt = &*I;
- Enc = combineWaitcnt(I->getOperand(0).getImm(), Enc);
- }
- break;
- // TODO: skip over instructions which never require wait.
- }
- break;
- }
- if (insertSWaitInst) {
- if (OldWaitcnt && OldWaitcnt->getOpcode() == AMDGPU::S_WAITCNT) {
- if (ForceEmitZeroWaitcnts)
- LLVM_DEBUG(
- dbgs()
- << "Force emit s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)\n");
- if (IsForceEmitWaitcnt)
- LLVM_DEBUG(dbgs()
- << "Force emit a s_waitcnt due to debug counter\n");
+ ScoreBrackets.applyWaitcnt(Wait);
- OldWaitcnt->getOperand(0).setImm(Enc);
- if (!OldWaitcnt->getParent())
- MI.getParent()->insert(MI, OldWaitcnt);
+ AMDGPU::Waitcnt OldWait;
+ if (OldWaitcntInstr) {
+ OldWait =
+ AMDGPU::decodeWaitcnt(IV, OldWaitcntInstr->getOperand(0).getImm());
+ }
+ if (OldWait.dominates(Wait))
+ return false;
- LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
- << "Old Instr: " << MI << '\n'
- << "New Instr: " << *OldWaitcnt << '\n');
- } else {
- auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
- MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
- .addImm(Enc);
- TrackedWaitcntSet.insert(SWaitInst);
+ if (OldWaitcntInstr && !TrackedWaitcntSet.count(OldWaitcntInstr))
+ Wait = Wait.combined(OldWait);
- LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
- << "Old Instr: " << MI << '\n'
- << "New Instr: " << *SWaitInst << '\n');
- }
- }
+ unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+ if (OldWaitcntInstr) {
+ OldWaitcntInstr->getOperand(0).setImm(Enc);
- if (CntVal[EXP_CNT] == 0) {
- ScoreBrackets->setMixedExpTypes(false);
- }
- }
- }
-}
+ LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
+ << "Old Instr: " << MI << '\n'
+ << "New Instr: " << *OldWaitcntInstr << '\n');
+ } else {
+ auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
+ MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+ .addImm(Enc);
+ TrackedWaitcntSet.insert(SWaitInst);
-void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB,
- MachineInstr *Waitcnt) {
- if (MBB.empty()) {
- MBB.push_back(Waitcnt);
- return;
+ LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
+ << "Old Instr: " << MI << '\n'
+ << "New Instr: " << *SWaitInst << '\n');
}
- MachineBasicBlock::iterator It = MBB.end();
- MachineInstr *MI = &*(--It);
- if (MI->isBranch()) {
- MBB.insert(It, Waitcnt);
- } else {
- MBB.push_back(Waitcnt);
- }
+ return true;
}
// This is a flat memory operation. Check to see if it has memory
@@ -1305,15 +1079,15 @@ bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
for (const MachineMemOperand *Memop : MI.memoperands()) {
unsigned AS = Memop->getAddrSpace();
- if (AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS)
+ if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
return true;
}
return false;
}
-void SIInsertWaitcnts::updateEventWaitcntAfter(
- MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets) {
+void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
+ WaitcntBrackets *ScoreBrackets) {
// Now look at the instruction opcode. If it is a memory access
// instruction, update the upper-bound of the appropriate counter's
// bracket and the destination operand scores.
@@ -1379,342 +1153,124 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(
}
}
-// Merge the score brackets of the Block's predecessors;
-// this merged score bracket is used when adding waitcnts to the Block
-void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
- BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
- int32_t MaxPending[NUM_INST_CNTS] = {0};
- int32_t MaxFlat[NUM_INST_CNTS] = {0};
- bool MixedExpTypes = false;
-
- // For single basic block loops, we need to retain the Block's
- // score bracket to have accurate Pred info. So, make a copy of Block's
- // score bracket, clear() it (which retains several important bits of info),
- // populate, and then replace en masse. For non-single basic block loops,
- // just clear Block's current score bracket and repopulate in-place.
- bool IsSelfPred;
- std::unique_ptr<BlockWaitcntBrackets> S;
-
- IsSelfPred = (std::find(Block.pred_begin(), Block.pred_end(), &Block))
- != Block.pred_end();
- if (IsSelfPred) {
- S = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
- ScoreBrackets = S.get();
- }
-
- ScoreBrackets->clear();
-
- // See if there are any uninitialized predecessors. If so, emit an
- // s_waitcnt 0 at the beginning of the block.
- for (MachineBasicBlock *Pred : Block.predecessors()) {
- BlockWaitcntBrackets *PredScoreBrackets =
- BlockWaitcntBracketsMap[Pred].get();
- bool Visited = BlockVisitedSet.count(Pred);
- if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
- continue;
- }
- for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
- T = (enum InstCounterType)(T + 1)) {
- int span =
- PredScoreBrackets->getScoreUB(T) - PredScoreBrackets->getScoreLB(T);
- MaxPending[T] = std::max(MaxPending[T], span);
- span =
- PredScoreBrackets->pendingFlat(T) - PredScoreBrackets->getScoreLB(T);
- MaxFlat[T] = std::max(MaxFlat[T], span);
- }
-
- MixedExpTypes |= PredScoreBrackets->mixedExpTypes();
- }
-
- // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
- // Also handle kills for exit block.
- if (Block.succ_empty() && !KillWaitBrackets.empty()) {
- for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
- for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
- T = (enum InstCounterType)(T + 1)) {
- int Span = KillWaitBrackets[I]->getScoreUB(T) -
- KillWaitBrackets[I]->getScoreLB(T);
- MaxPending[T] = std::max(MaxPending[T], Span);
- Span = KillWaitBrackets[I]->pendingFlat(T) -
- KillWaitBrackets[I]->getScoreLB(T);
- MaxFlat[T] = std::max(MaxFlat[T], Span);
- }
-
- MixedExpTypes |= KillWaitBrackets[I]->mixedExpTypes();
- }
- }
-
- // Special handling for GDS_GPR_LOCK and EXP_GPR_LOCK.
- for (MachineBasicBlock *Pred : Block.predecessors()) {
- BlockWaitcntBrackets *PredScoreBrackets =
- BlockWaitcntBracketsMap[Pred].get();
- bool Visited = BlockVisitedSet.count(Pred);
- if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
- continue;
- }
-
- int GDSSpan = PredScoreBrackets->getEventUB(GDS_GPR_LOCK) -
- PredScoreBrackets->getScoreLB(EXP_CNT);
- MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
- int EXPSpan = PredScoreBrackets->getEventUB(EXP_GPR_LOCK) -
- PredScoreBrackets->getScoreLB(EXP_CNT);
- MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
- }
-
- // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
- if (Block.succ_empty() && !KillWaitBrackets.empty()) {
- for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
- int GDSSpan = KillWaitBrackets[I]->getEventUB(GDS_GPR_LOCK) -
- KillWaitBrackets[I]->getScoreLB(EXP_CNT);
- MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
- int EXPSpan = KillWaitBrackets[I]->getEventUB(EXP_GPR_LOCK) -
- KillWaitBrackets[I]->getScoreLB(EXP_CNT);
- MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
- }
- }
-
-#if 0
- // LC does not (unlike) add a waitcnt at beginning. Leaving it as marker.
- // TODO: how does LC distinguish between function entry and main entry?
- // If this is the entry to a function, force a wait.
- MachineBasicBlock &Entry = Block.getParent()->front();
- if (Entry.getNumber() == Block.getNumber()) {
- ScoreBrackets->setWaitAtBeginning();
- return;
- }
-#endif
-
- // Now set the current Block's brackets to the largest ending bracket.
- for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
- T = (enum InstCounterType)(T + 1)) {
- ScoreBrackets->setScoreUB(T, MaxPending[T]);
- ScoreBrackets->setScoreLB(T, 0);
- ScoreBrackets->setLastFlat(T, MaxFlat[T]);
- }
-
- ScoreBrackets->setMixedExpTypes(MixedExpTypes);
-
- // Set the register scoreboard.
- for (MachineBasicBlock *Pred : Block.predecessors()) {
- if (!BlockVisitedSet.count(Pred)) {
- continue;
- }
+bool WaitcntBrackets::mergeScore(const MergeInfo &M, uint32_t &Score,
+ uint32_t OtherScore) {
+ uint32_t MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
+ uint32_t OtherShifted =
+ OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
+ Score = std::max(MyShifted, OtherShifted);
+ return OtherShifted > MyShifted;
+}
- BlockWaitcntBrackets *PredScoreBrackets =
- BlockWaitcntBracketsMap[Pred].get();
+/// Merge the pending events and associated score brackets of \p Other into
+/// this brackets status.
+///
+/// Returns whether the merge resulted in a change that requires tighter waits
+/// (i.e. the merged brackets strictly dominate the original brackets).
+bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
+ bool StrictDom = false;
- // Now merge the gpr_reg_score information
- for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
- T = (enum InstCounterType)(T + 1)) {
- int PredLB = PredScoreBrackets->getScoreLB(T);
- int PredUB = PredScoreBrackets->getScoreUB(T);
- if (PredLB < PredUB) {
- int PredScale = MaxPending[T] - PredUB;
- // Merge vgpr scores.
- for (int J = 0; J <= PredScoreBrackets->getMaxVGPR(); J++) {
- int PredRegScore = PredScoreBrackets->getRegScore(J, T);
- if (PredRegScore <= PredLB)
- continue;
- int NewRegScore = PredScale + PredRegScore;
- ScoreBrackets->setRegScore(
- J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
- }
- // Also need to merge sgpr scores for lgkm_cnt.
- if (T == LGKM_CNT) {
- for (int J = 0; J <= PredScoreBrackets->getMaxSGPR(); J++) {
- int PredRegScore =
- PredScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
- if (PredRegScore <= PredLB)
- continue;
- int NewRegScore = PredScale + PredRegScore;
- ScoreBrackets->setRegScore(
- J + NUM_ALL_VGPRS, LGKM_CNT,
- std::max(
- ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
- NewRegScore));
- }
- }
- }
- }
+ for (auto T : inst_counter_types()) {
+ // Merge event flags for this counter
+ const bool OldOutOfOrder = counterOutOfOrder(T);
+ const uint32_t OldEvents = PendingEvents & WaitEventMaskForInst[T];
+ const uint32_t OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
+ if (OtherEvents & ~OldEvents)
+ StrictDom = true;
+ if (Other.MixedPendingEvents[T] ||
+ (OldEvents && OtherEvents && OldEvents != OtherEvents))
+ MixedPendingEvents[T] = true;
+ PendingEvents |= OtherEvents;
- // Also merge the WaitEvent information.
- ForAllWaitEventType(W) {
- enum InstCounterType T = PredScoreBrackets->eventCounter(W);
- int PredEventUB = PredScoreBrackets->getEventUB(W);
- if (PredEventUB > PredScoreBrackets->getScoreLB(T)) {
- int NewEventUB =
- MaxPending[T] + PredEventUB - PredScoreBrackets->getScoreUB(T);
- if (NewEventUB > 0) {
- ScoreBrackets->setEventUB(
- W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
- }
- }
- }
- }
+ // Merge scores for this counter
+ const uint32_t MyPending = ScoreUBs[T] - ScoreLBs[T];
+ const uint32_t OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
+ MergeInfo M;
+ M.OldLB = ScoreLBs[T];
+ M.OtherLB = Other.ScoreLBs[T];
+ M.MyShift = OtherPending > MyPending ? OtherPending - MyPending : 0;
+ M.OtherShift = ScoreUBs[T] - Other.ScoreUBs[T] + M.MyShift;
- // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
- // Set the register scoreboard.
- if (Block.succ_empty() && !KillWaitBrackets.empty()) {
- for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
- // Now merge the gpr_reg_score information.
- for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
- T = (enum InstCounterType)(T + 1)) {
- int PredLB = KillWaitBrackets[I]->getScoreLB(T);
- int PredUB = KillWaitBrackets[I]->getScoreUB(T);
- if (PredLB < PredUB) {
- int PredScale = MaxPending[T] - PredUB;
- // Merge vgpr scores.
- for (int J = 0; J <= KillWaitBrackets[I]->getMaxVGPR(); J++) {
- int PredRegScore = KillWaitBrackets[I]->getRegScore(J, T);
- if (PredRegScore <= PredLB)
- continue;
- int NewRegScore = PredScale + PredRegScore;
- ScoreBrackets->setRegScore(
- J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
- }
- // Also need to merge sgpr scores for lgkm_cnt.
- if (T == LGKM_CNT) {
- for (int J = 0; J <= KillWaitBrackets[I]->getMaxSGPR(); J++) {
- int PredRegScore =
- KillWaitBrackets[I]->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
- if (PredRegScore <= PredLB)
- continue;
- int NewRegScore = PredScale + PredRegScore;
- ScoreBrackets->setRegScore(
- J + NUM_ALL_VGPRS, LGKM_CNT,
- std::max(
- ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
- NewRegScore));
- }
- }
- }
- }
+ const uint32_t NewUB = ScoreUBs[T] + M.MyShift;
+ if (NewUB < ScoreUBs[T])
+ report_fatal_error("waitcnt score overflow");
+ ScoreUBs[T] = NewUB;
+ ScoreLBs[T] = std::min(M.OldLB + M.MyShift, M.OtherLB + M.OtherShift);
- // Also merge the WaitEvent information.
- ForAllWaitEventType(W) {
- enum InstCounterType T = KillWaitBrackets[I]->eventCounter(W);
- int PredEventUB = KillWaitBrackets[I]->getEventUB(W);
- if (PredEventUB > KillWaitBrackets[I]->getScoreLB(T)) {
- int NewEventUB =
- MaxPending[T] + PredEventUB - KillWaitBrackets[I]->getScoreUB(T);
- if (NewEventUB > 0) {
- ScoreBrackets->setEventUB(
- W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
- }
- }
- }
- }
- }
+ StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
- // Special case handling of GDS_GPR_LOCK and EXP_GPR_LOCK. Merge this for the
- // sequencing predecessors, because changes to EXEC require waitcnts due to
- // the delayed nature of these operations.
- for (MachineBasicBlock *Pred : Block.predecessors()) {
- if (!BlockVisitedSet.count(Pred)) {
- continue;
+ bool RegStrictDom = false;
+ for (int J = 0, E = std::max(getMaxVGPR(), Other.getMaxVGPR()) + 1; J != E;
+ J++) {
+ RegStrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
}
- BlockWaitcntBrackets *PredScoreBrackets =
- BlockWaitcntBracketsMap[Pred].get();
-
- int pred_gds_ub = PredScoreBrackets->getEventUB(GDS_GPR_LOCK);
- if (pred_gds_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
- int new_gds_ub = MaxPending[EXP_CNT] + pred_gds_ub -
- PredScoreBrackets->getScoreUB(EXP_CNT);
- if (new_gds_ub > 0) {
- ScoreBrackets->setEventUB(
- GDS_GPR_LOCK,
- std::max(ScoreBrackets->getEventUB(GDS_GPR_LOCK), new_gds_ub));
- }
- }
- int pred_exp_ub = PredScoreBrackets->getEventUB(EXP_GPR_LOCK);
- if (pred_exp_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
- int new_exp_ub = MaxPending[EXP_CNT] + pred_exp_ub -
- PredScoreBrackets->getScoreUB(EXP_CNT);
- if (new_exp_ub > 0) {
- ScoreBrackets->setEventUB(
- EXP_GPR_LOCK,
- std::max(ScoreBrackets->getEventUB(EXP_GPR_LOCK), new_exp_ub));
+ if (T == LGKM_CNT) {
+ for (int J = 0, E = std::max(getMaxSGPR(), Other.getMaxSGPR()) + 1;
+ J != E; J++) {
+ RegStrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
}
}
- }
- // if a single block loop, update the score brackets. Not needed for other
- // blocks, as we did this in-place
- if (IsSelfPred) {
- BlockWaitcntBracketsMap[&Block] = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
+ if (RegStrictDom && !OldOutOfOrder)
+ StrictDom = true;
}
-}
-/// Return true if the given basic block is a "bottom" block of a loop.
-/// This works even if the loop is discontiguous. This also handles
-/// multiple back-edges for the same "header" block of a loop.
-bool SIInsertWaitcnts::isLoopBottom(const MachineLoop *Loop,
- const MachineBasicBlock *Block) {
- for (MachineBasicBlock *MBB : Loop->blocks()) {
- if (MBB == Block && MBB->isSuccessor(Loop->getHeader())) {
- return true;
- }
- }
- return false;
-}
+ VgprUB = std::max(getMaxVGPR(), Other.getMaxVGPR());
+ SgprUB = std::max(getMaxSGPR(), Other.getMaxSGPR());
-/// Count the number of "bottom" basic blocks of a loop.
-unsigned SIInsertWaitcnts::countNumBottomBlocks(const MachineLoop *Loop) {
- unsigned Count = 0;
- for (MachineBasicBlock *MBB : Loop->blocks()) {
- if (MBB->isSuccessor(Loop->getHeader())) {
- Count++;
- }
- }
- return Count;
+ return StrictDom;
}
// Generate s_waitcnt instructions where needed.
-void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
- MachineBasicBlock &Block) {
- // Initialize the state information.
- mergeInputScoreBrackets(Block);
-
- BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
+bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
+ MachineBasicBlock &Block,
+ WaitcntBrackets &ScoreBrackets) {
+ bool Modified = false;
LLVM_DEBUG({
dbgs() << "*** Block" << Block.getNumber() << " ***";
- ScoreBrackets->dump();
+ ScoreBrackets.dump();
});
// Walk over the instructions.
+ MachineInstr *OldWaitcntInstr = nullptr;
+
for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end();
Iter != E;) {
MachineInstr &Inst = *Iter;
+
// Remove any previously existing waitcnts.
if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
- // Leave pre-existing waitcnts, but note their existence via setWaitcnt.
- // Remove the waitcnt-pass-generated waitcnts; the pass will add them back
- // as needed.
- if (!TrackedWaitcntSet.count(&Inst))
- ++Iter;
- else {
- ++Iter;
- Inst.removeFromParent();
+ if (OldWaitcntInstr) {
+ if (TrackedWaitcntSet.count(OldWaitcntInstr)) {
+ TrackedWaitcntSet.erase(OldWaitcntInstr);
+ OldWaitcntInstr->eraseFromParent();
+ OldWaitcntInstr = nullptr;
+ } else if (!TrackedWaitcntSet.count(&Inst)) {
+ // Two successive s_waitcnt's, both of which are pre-existing and
+ // are therefore preserved.
+ int64_t Imm = OldWaitcntInstr->getOperand(0).getImm();
+ ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
+ } else {
+ ++Iter;
+ Inst.eraseFromParent();
+ Modified = true;
+ continue;
+ }
}
- ScoreBrackets->setWaitcnt(&Inst);
- continue;
- }
- // Kill instructions generate a conditional branch to the endmain block.
- // Merge the current waitcnt state into the endmain block information.
- // TODO: Are there other flavors of KILL instruction?
- if (Inst.getOpcode() == AMDGPU::KILL) {
- addKillWaitBracket(ScoreBrackets);
+ OldWaitcntInstr = &Inst;
+ ++Iter;
+ continue;
}
bool VCCZBugWorkAround = false;
if (readsVCCZ(Inst) &&
(!VCCZBugHandledSet.count(&Inst))) {
- if (ScoreBrackets->getScoreLB(LGKM_CNT) <
- ScoreBrackets->getScoreUB(LGKM_CNT) &&
- ScoreBrackets->hasPendingSMEM()) {
+ if (ScoreBrackets.getScoreLB(LGKM_CNT) <
+ ScoreBrackets.getScoreUB(LGKM_CNT) &&
+ ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
if (ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
VCCZBugWorkAround = true;
}
@@ -1722,9 +1278,10 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
// Generate an s_waitcnt instruction to be placed before
// cur_Inst, if needed.
- generateWaitcntInstBefore(Inst, ScoreBrackets);
+ Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
+ OldWaitcntInstr = nullptr;
- updateEventWaitcntAfter(Inst, ScoreBrackets);
+ updateEventWaitcntAfter(Inst, &ScoreBrackets);
#if 0 // TODO: implement resource type check controlled by options with ub = LB.
// If this instruction generates a S_SETVSKIP because it is an
@@ -1737,11 +1294,9 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
}
#endif
- ScoreBrackets->clearWaitcnt();
-
LLVM_DEBUG({
Inst.print(dbgs());
- ScoreBrackets->dump();
+ ScoreBrackets.dump();
});
// Check to see if this is a GWS instruction. If so, and if this is CI or
@@ -1753,10 +1308,7 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P ||
Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
// TODO: && context->target_info->GwsRequiresMemViolTest() ) {
- ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
- ScoreBrackets->updateByWait(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
- ScoreBrackets->updateByWait(LGKM_CNT,
- ScoreBrackets->getScoreUB(LGKM_CNT));
+ ScoreBrackets.applyWaitcnt(AMDGPU::Waitcnt::allZero());
}
// TODO: Remove this work-around after fixing the scheduler and enable the
@@ -1769,71 +1321,13 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
AMDGPU::VCC)
.addReg(AMDGPU::VCC);
VCCZBugHandledSet.insert(&Inst);
+ Modified = true;
}
++Iter;
}
- // Check if we need to force convergence at loop footer.
- MachineLoop *ContainingLoop = MLI->getLoopFor(&Block);
- if (ContainingLoop && isLoopBottom(ContainingLoop, &Block)) {
- LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
- WaitcntData->print();
- LLVM_DEBUG(dbgs() << '\n';);
-
- // The iterative waitcnt insertion algorithm aims for optimal waitcnt
- // placement, but doesn't guarantee convergence for a loop. Each
- // loop should take at most (n+1) iterations for it to converge naturally,
- // where n is the number of bottom blocks. If this threshold is reached and
- // the result hasn't converged, then we force convergence by inserting
- // a s_waitcnt at the end of loop footer.
- if (WaitcntData->getIterCnt() > (countNumBottomBlocks(ContainingLoop) + 1)) {
- // To ensure convergence, need to make wait events at loop footer be no
- // more than those from the previous iteration.
- // As a simplification, instead of tracking individual scores and
- // generating the precise wait count, just wait on 0.
- bool HasPending = false;
- MachineInstr *SWaitInst = WaitcntData->getWaitcnt();
- for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
- T = (enum InstCounterType)(T + 1)) {
- if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
- ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
- HasPending = true;
- break;
- }
- }
-
- if (HasPending) {
- if (!SWaitInst) {
- SWaitInst = BuildMI(Block, Block.getFirstNonPHI(),
- DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
- .addImm(0);
- TrackedWaitcntSet.insert(SWaitInst);
-#if 0 // TODO: Format the debug output
- OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context);
- OutputTransformAdd(SWaitInst, context);
-#endif
- }
-#if 0 // TODO: ??
- _DEV( REPORTED_STATS->force_waitcnt_converge = 1; )
-#endif
- }
-
- if (SWaitInst) {
- LLVM_DEBUG({
- SWaitInst->print(dbgs());
- dbgs() << "\nAdjusted score board:";
- ScoreBrackets->dump();
- });
-
- // Add this waitcnt to the block. It is either newly created or
- // created in previous iterations and added back since block traversal
- // always removes waitcnts.
- insertWaitcntBeforeCF(Block, SWaitInst);
- WaitcntData->setWaitcnt(SWaitInst);
- }
- }
- }
+ return Modified;
}
bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
@@ -1841,14 +1335,11 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
TII = ST->getInstrInfo();
TRI = &TII->getRegisterInfo();
MRI = &MF.getRegInfo();
- MLI = &getAnalysis<MachineLoopInfo>();
- IV = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits());
+ IV = AMDGPU::getIsaVersion(ST->getCPU());
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- AMDGPUASI = ST->getAMDGPUAS();
ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
- for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
- T = (enum InstCounterType)(T + 1))
+ for (auto T : inst_counter_types())
ForceEmitWaitcnt[T] = false;
HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
@@ -1868,93 +1359,70 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;
TrackedWaitcntSet.clear();
- BlockVisitedSet.clear();
VCCZBugHandledSet.clear();
- LoopWaitcntDataMap.clear();
- BlockWaitcntProcessedSet.clear();
+ RpotIdxMap.clear();
+ BlockInfos.clear();
- // Walk over the blocks in reverse post-dominator order, inserting
- // s_waitcnt where needed.
- ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
- bool Modified = false;
- for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
- I = RPOT.begin(),
- E = RPOT.end(), J = RPOT.begin();
- I != E;) {
- MachineBasicBlock &MBB = **I;
+ // Keep iterating over the blocks in reverse post order, inserting and
+ // updating s_waitcnt where needed, until a fix point is reached.
+ for (MachineBasicBlock *MBB :
+ ReversePostOrderTraversal<MachineFunction *>(&MF)) {
+ RpotIdxMap[MBB] = BlockInfos.size();
+ BlockInfos.emplace_back(MBB);
+ }
- BlockVisitedSet.insert(&MBB);
+ std::unique_ptr<WaitcntBrackets> Brackets;
+ bool Modified = false;
+ bool Repeat;
+ do {
+ Repeat = false;
- BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
- if (!ScoreBrackets) {
- BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>(ST);
- ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
- }
- ScoreBrackets->setPostOrder(MBB.getNumber());
- MachineLoop *ContainingLoop = MLI->getLoopFor(&MBB);
- if (ContainingLoop && LoopWaitcntDataMap[ContainingLoop] == nullptr)
- LoopWaitcntDataMap[ContainingLoop] = llvm::make_unique<LoopWaitcntData>();
+ for (BlockInfo &BI : BlockInfos) {
+ if (!BI.Dirty)
+ continue;
- // If we are walking into the block from before the loop, then guarantee
- // at least 1 re-walk over the loop to propagate the information, even if
- // no S_WAITCNT instructions were generated.
- if (ContainingLoop && ContainingLoop->getHeader() == &MBB) {
- unsigned Count = countNumBottomBlocks(ContainingLoop);
+ unsigned Idx = std::distance(&*BlockInfos.begin(), &BI);
- // If the loop has multiple back-edges, and so more than one "bottom"
- // basic block, we have to guarantee a re-walk over every blocks.
- if ((std::count(BlockWaitcntProcessedSet.begin(),
- BlockWaitcntProcessedSet.end(), &MBB) < (int)Count)) {
- BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
- LLVM_DEBUG(dbgs() << "set-revisit1: Block"
- << ContainingLoop->getHeader()->getNumber() << '\n';);
+ if (BI.Incoming) {
+ if (!Brackets)
+ Brackets = llvm::make_unique<WaitcntBrackets>(*BI.Incoming);
+ else
+ *Brackets = *BI.Incoming;
+ } else {
+ if (!Brackets)
+ Brackets = llvm::make_unique<WaitcntBrackets>(ST);
+ else
+ Brackets->clear();
}
- }
- // Walk over the instructions.
- insertWaitcntInBlock(MF, MBB);
+ Modified |= insertWaitcntInBlock(MF, *BI.MBB, *Brackets);
+ BI.Dirty = false;
- // Record that waitcnts have been processed at least once for this block.
- BlockWaitcntProcessedSet.push_back(&MBB);
-
- // See if we want to revisit the loop. If a loop has multiple back-edges,
- // we shouldn't revisit the same "bottom" basic block.
- if (ContainingLoop && isLoopBottom(ContainingLoop, &MBB) &&
- std::count(BlockWaitcntProcessedSet.begin(),
- BlockWaitcntProcessedSet.end(), &MBB) == 1) {
- MachineBasicBlock *EntryBB = ContainingLoop->getHeader();
- BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get();
- if (EntrySB && EntrySB->getRevisitLoop()) {
- EntrySB->setRevisitLoop(false);
- J = I;
- int32_t PostOrder = EntrySB->getPostOrder();
- // TODO: Avoid this loop. Find another way to set I.
- for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
- X = RPOT.begin(),
- Y = RPOT.end();
- X != Y; ++X) {
- MachineBasicBlock &MBBX = **X;
- if (MBBX.getNumber() == PostOrder) {
- I = X;
- break;
+ if (Brackets->hasPending()) {
+ BlockInfo *MoveBracketsToSucc = nullptr;
+ for (MachineBasicBlock *Succ : BI.MBB->successors()) {
+ unsigned SuccIdx = RpotIdxMap[Succ];
+ BlockInfo &SuccBI = BlockInfos[SuccIdx];
+ if (!SuccBI.Incoming) {
+ SuccBI.Dirty = true;
+ if (SuccIdx <= Idx)
+ Repeat = true;
+ if (!MoveBracketsToSucc) {
+ MoveBracketsToSucc = &SuccBI;
+ } else {
+ SuccBI.Incoming = llvm::make_unique<WaitcntBrackets>(*Brackets);
+ }
+ } else if (SuccBI.Incoming->merge(*Brackets)) {
+ SuccBI.Dirty = true;
+ if (SuccIdx <= Idx)
+ Repeat = true;
}
}
- LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
- WaitcntData->incIterCnt();
- LLVM_DEBUG(dbgs() << "revisit: Block" << EntryBB->getNumber() << '\n';);
- continue;
- } else {
- LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
- // Loop converged, reset iteration count. If this loop gets revisited,
- // it must be from an outer loop, the counter will restart, this will
- // ensure we don't force convergence on such revisits.
- WaitcntData->resetIterCnt();
+ if (MoveBracketsToSucc)
+ MoveBracketsToSucc->Incoming = std::move(Brackets);
}
}
-
- J = I;
- ++I;
- }
+ } while (Repeat);
SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
diff --git a/lib/Target/AMDGPU/SIInstrFormats.td b/lib/Target/AMDGPU/SIInstrFormats.td
index b73d30940fc38..65ffc27b8b608 100644
--- a/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/lib/Target/AMDGPU/SIInstrFormats.td
@@ -121,6 +121,10 @@ class InstSI <dag outs, dag ins, string asm = "",
// This bit indicates that this is a D16 buffer instruction.
field bit D16Buf = 0;
+ // This bit indicates that this uses the floating point double precision
+ // rounding mode flags
+ field bit FPDPRounding = 0;
+
// These need to be kept in sync with the enum in SIInstrFlags.
let TSFlags{0} = SALU;
let TSFlags{1} = VALU;
@@ -178,6 +182,8 @@ class InstSI <dag outs, dag ins, string asm = "",
let TSFlags{50} = D16Buf;
+ let TSFlags{51} = FPDPRounding;
+
let SchedRW = [Write32Bit];
field bits<1> DisableSIDecoder = 0;
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index f3745382a6f4b..2370d5fa7b27b 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -31,6 +31,7 @@
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
@@ -264,9 +265,10 @@ static bool isStride64(unsigned Opc) {
}
}
-bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
- int64_t &Offset,
- const TargetRegisterInfo *TRI) const {
+bool SIInstrInfo::getMemOperandWithOffset(MachineInstr &LdSt,
+ MachineOperand *&BaseOp,
+ int64_t &Offset,
+ const TargetRegisterInfo *TRI) const {
unsigned Opc = LdSt.getOpcode();
if (isDS(LdSt)) {
@@ -274,11 +276,10 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
getNamedOperand(LdSt, AMDGPU::OpName::offset);
if (OffsetImm) {
// Normal, single offset LDS instruction.
- const MachineOperand *AddrReg =
- getNamedOperand(LdSt, AMDGPU::OpName::addr);
-
- BaseReg = AddrReg->getReg();
+ BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
Offset = OffsetImm->getImm();
+ assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
+ "operands of type register.");
return true;
}
@@ -309,10 +310,10 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
if (isStride64(Opc))
EltSize *= 64;
- const MachineOperand *AddrReg =
- getNamedOperand(LdSt, AMDGPU::OpName::addr);
- BaseReg = AddrReg->getReg();
+ BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
Offset = EltSize * Offset0;
+ assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
+ "operands of type register.");
return true;
}
@@ -324,19 +325,20 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
if (SOffset && SOffset->isReg())
return false;
- const MachineOperand *AddrReg =
- getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
+ MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
if (!AddrReg)
return false;
const MachineOperand *OffsetImm =
getNamedOperand(LdSt, AMDGPU::OpName::offset);
- BaseReg = AddrReg->getReg();
+ BaseOp = AddrReg;
Offset = OffsetImm->getImm();
if (SOffset) // soffset can be an inline immediate.
Offset += SOffset->getImm();
+ assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
+ "operands of type register.");
return true;
}
@@ -346,36 +348,46 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
if (!OffsetImm)
return false;
- const MachineOperand *SBaseReg =
- getNamedOperand(LdSt, AMDGPU::OpName::sbase);
- BaseReg = SBaseReg->getReg();
+ MachineOperand *SBaseReg = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
+ BaseOp = SBaseReg;
Offset = OffsetImm->getImm();
+ assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
+ "operands of type register.");
return true;
}
if (isFLAT(LdSt)) {
- const MachineOperand *VAddr = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
+ MachineOperand *VAddr = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
if (VAddr) {
// Can't analyze 2 offsets.
if (getNamedOperand(LdSt, AMDGPU::OpName::saddr))
return false;
- BaseReg = VAddr->getReg();
+ BaseOp = VAddr;
} else {
// scratch instructions have either vaddr or saddr.
- BaseReg = getNamedOperand(LdSt, AMDGPU::OpName::saddr)->getReg();
+ BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
}
Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
+ assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
+ "operands of type register.");
return true;
}
return false;
}
-static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, unsigned BaseReg1,
- const MachineInstr &MI2, unsigned BaseReg2) {
- if (BaseReg1 == BaseReg2)
+static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
+ const MachineOperand &BaseOp1,
+ const MachineInstr &MI2,
+ const MachineOperand &BaseOp2) {
+ // Support only base operands with base registers.
+ // Note: this could be extended to support FI operands.
+ if (!BaseOp1.isReg() || !BaseOp2.isReg())
+ return false;
+
+ if (BaseOp1.isIdenticalTo(BaseOp2))
return true;
if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
@@ -401,12 +413,13 @@ static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, unsigned BaseReg1,
return Base1 == Base2;
}
-bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
- unsigned BaseReg1,
- MachineInstr &SecondLdSt,
- unsigned BaseReg2,
+bool SIInstrInfo::shouldClusterMemOps(MachineOperand &BaseOp1,
+ MachineOperand &BaseOp2,
unsigned NumLoads) const {
- if (!memOpsHaveSameBasePtr(FirstLdSt, BaseReg1, SecondLdSt, BaseReg2))
+ MachineInstr &FirstLdSt = *BaseOp1.getParent();
+ MachineInstr &SecondLdSt = *BaseOp2.getParent();
+
+ if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOp1, SecondLdSt, BaseOp2))
return false;
const MachineOperand *FirstDst = nullptr;
@@ -863,7 +876,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineFunction *MF = MBB.getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
MachineFrameInfo &FrameInfo = MF->getFrameInfo();
- DebugLoc DL = MBB.findDebugLoc(MI);
+ const DebugLoc &DL = MBB.findDebugLoc(MI);
unsigned Size = FrameInfo.getObjectSize(FrameIndex);
unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
@@ -907,16 +920,6 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
return;
}
- if (!ST.isVGPRSpillingEnabled(MF->getFunction())) {
- LLVMContext &Ctx = MF->getFunction().getContext();
- Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to"
- " spill register");
- BuildMI(MBB, MI, DL, get(AMDGPU::KILL))
- .addReg(SrcReg);
-
- return;
- }
-
assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
unsigned Opcode = getVGPRSpillSaveOpcode(SpillSize);
@@ -972,9 +975,9 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
MachineFunction *MF = MBB.getParent();
- const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+ SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
MachineFrameInfo &FrameInfo = MF->getFrameInfo();
- DebugLoc DL = MBB.findDebugLoc(MI);
+ const DebugLoc &DL = MBB.findDebugLoc(MI);
unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
unsigned Size = FrameInfo.getObjectSize(FrameIndex);
unsigned SpillSize = TRI->getSpillSize(*RC);
@@ -986,6 +989,8 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
PtrInfo, MachineMemOperand::MOLoad, Size, Align);
if (RI.isSGPRClass(RC)) {
+ MFI->setHasSpilledSGPRs();
+
// FIXME: Maybe this should not include a memoperand because it will be
// lowered to non-memory instructions.
const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
@@ -1009,15 +1014,6 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
return;
}
- if (!ST.isVGPRSpillingEnabled(MF->getFunction())) {
- LLVMContext &Ctx = MF->getFunction().getContext();
- Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to"
- " restore register");
- BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg);
-
- return;
- }
-
assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
unsigned Opcode = getVGPRSpillRestoreOpcode(SpillSize);
@@ -1036,7 +1032,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(
MachineFunction *MF = MBB.getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
- DebugLoc DL = MBB.findDebugLoc(MI);
+ const DebugLoc &DL = MBB.findDebugLoc(MI);
unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize();
unsigned WavefrontSize = ST.getWavefrontSize();
@@ -1044,7 +1040,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(
if (!MFI->hasCalculatedTID()) {
MachineBasicBlock &Entry = MBB.getParent()->front();
MachineBasicBlock::iterator Insert = Entry.front();
- DebugLoc DL = Insert->getDebugLoc();
+ const DebugLoc &DL = Insert->getDebugLoc();
TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass,
*MF);
@@ -1421,10 +1417,15 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
// TargetInstrInfo::commuteInstruction uses it.
bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0,
unsigned &SrcOpIdx1) const {
- if (!MI.isCommutable())
+ return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
+}
+
+bool SIInstrInfo::findCommutedOpIndices(MCInstrDesc Desc, unsigned &SrcOpIdx0,
+ unsigned &SrcOpIdx1) const {
+ if (!Desc.isCommutable())
return false;
- unsigned Opc = MI.getOpcode();
+ unsigned Opc = Desc.getOpcode();
int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
if (Src0Idx == -1)
return false;
@@ -1549,8 +1550,9 @@ unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
// buzz;
RS->enterBasicBlockEnd(MBB);
- unsigned Scav = RS->scavengeRegister(&AMDGPU::SReg_64RegClass,
- MachineBasicBlock::iterator(GetPC), 0);
+ unsigned Scav = RS->scavengeRegisterBackwards(
+ AMDGPU::SReg_64RegClass,
+ MachineBasicBlock::iterator(GetPC), false, 0);
MRI.replaceRegWith(PCReg, Scav);
MRI.clearVirtRegs();
RS->setRegUsed(Scav);
@@ -1644,7 +1646,34 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
SmallVectorImpl<MachineOperand> &Cond,
bool AllowModify) const {
MachineBasicBlock::iterator I = MBB.getFirstTerminator();
- if (I == MBB.end())
+ auto E = MBB.end();
+ if (I == E)
+ return false;
+
+ // Skip over the instructions that are artificially terminators for special
+ // exec management.
+ while (I != E && !I->isBranch() && !I->isReturn() &&
+ I->getOpcode() != AMDGPU::SI_MASK_BRANCH) {
+ switch (I->getOpcode()) {
+ case AMDGPU::SI_MASK_BRANCH:
+ case AMDGPU::S_MOV_B64_term:
+ case AMDGPU::S_XOR_B64_term:
+ case AMDGPU::S_ANDN2_B64_term:
+ break;
+ case AMDGPU::SI_IF:
+ case AMDGPU::SI_ELSE:
+ case AMDGPU::SI_KILL_I1_TERMINATOR:
+ case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
+ // FIXME: It's messy that these need to be considered here at all.
+ return true;
+ default:
+ llvm_unreachable("unexpected non-branch terminator inst");
+ }
+
+ ++I;
+ }
+
+ if (I == E)
return false;
if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH)
@@ -1933,20 +1962,20 @@ bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const {
}
unsigned SIInstrInfo::getAddressSpaceForPseudoSourceKind(
- PseudoSourceValue::PSVKind Kind) const {
+ unsigned Kind) const {
switch(Kind) {
case PseudoSourceValue::Stack:
case PseudoSourceValue::FixedStack:
- return ST.getAMDGPUAS().PRIVATE_ADDRESS;
+ return AMDGPUAS::PRIVATE_ADDRESS;
case PseudoSourceValue::ConstantPool:
case PseudoSourceValue::GOT:
case PseudoSourceValue::JumpTable:
case PseudoSourceValue::GlobalValueCallEntry:
case PseudoSourceValue::ExternalSymbolCallEntry:
case PseudoSourceValue::TargetCustom:
- return ST.getAMDGPUAS().CONSTANT_ADDRESS;
+ return AMDGPUAS::CONSTANT_ADDRESS;
}
- return ST.getAMDGPUAS().FLAT_ADDRESS;
+ return AMDGPUAS::FLAT_ADDRESS;
}
static void removeModOperands(MachineInstr &MI) {
@@ -2066,12 +2095,40 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
if (Src2->isReg() && Src2->getReg() == Reg) {
// Not allowed to use constant bus for another operand.
// We can however allow an inline immediate as src0.
- if (!Src0->isImm() &&
- (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))
- return false;
+ bool Src0Inlined = false;
+ if (Src0->isReg()) {
+ // Try to inline constant if possible.
+ // If the Def moves immediate and the use is single
+ // We are saving VGPR here.
+ MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
+ if (Def && Def->isMoveImmediate() &&
+ isInlineConstant(Def->getOperand(1)) &&
+ MRI->hasOneUse(Src0->getReg())) {
+ Src0->ChangeToImmediate(Def->getOperand(1).getImm());
+ Src0Inlined = true;
+ } else if ((RI.isPhysicalRegister(Src0->getReg()) &&
+ RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg()))) ||
+ (RI.isVirtualRegister(Src0->getReg()) &&
+ RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))
+ return false;
+ // VGPR is okay as Src0 - fallthrough
+ }
- if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
- return false;
+ if (Src1->isReg() && !Src0Inlined ) {
+ // We have one slot for inlinable constant so far - try to fill it
+ MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
+ if (Def && Def->isMoveImmediate() &&
+ isInlineConstant(Def->getOperand(1)) &&
+ MRI->hasOneUse(Src1->getReg()) &&
+ commuteInstruction(UseMI)) {
+ Src0->ChangeToImmediate(Def->getOperand(1).getImm());
+ } else if ((RI.isPhysicalRegister(Src1->getReg()) &&
+ RI.isSGPRClass(RI.getPhysRegClass(Src1->getReg()))) ||
+ (RI.isVirtualRegister(Src1->getReg()) &&
+ RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
+ return false;
+ // VGPR is okay as Src1 - fallthrough
+ }
const int64_t Imm = ImmOp->getImm();
@@ -2117,11 +2174,13 @@ static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa,
MachineInstr &MIb) const {
- unsigned BaseReg0, BaseReg1;
+ MachineOperand *BaseOp0, *BaseOp1;
int64_t Offset0, Offset1;
- if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) &&
- getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) {
+ if (getMemOperandWithOffset(MIa, BaseOp0, Offset0, &RI) &&
+ getMemOperandWithOffset(MIb, BaseOp1, Offset1, &RI)) {
+ if (!BaseOp0->isIdenticalTo(*BaseOp1))
+ return false;
if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
// FIXME: Handle ds_read2 / ds_write2.
@@ -2129,8 +2188,7 @@ bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa,
}
unsigned Width0 = (*MIa.memoperands_begin())->getSize();
unsigned Width1 = (*MIb.memoperands_begin())->getSize();
- if (BaseReg0 == BaseReg1 &&
- offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) {
+ if (offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) {
return true;
}
}
@@ -2398,8 +2456,7 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
case AMDGPU::OPERAND_REG_INLINE_C_INT32:
case AMDGPU::OPERAND_REG_INLINE_C_FP32: {
int32_t Trunc = static_cast<int32_t>(Imm);
- return Trunc == Imm &&
- AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
+ return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
}
case AMDGPU::OPERAND_REG_IMM_INT64:
case AMDGPU::OPERAND_REG_IMM_FP64:
@@ -2523,6 +2580,115 @@ bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
hasModifiersSet(MI, AMDGPU::OpName::omod);
}
+bool SIInstrInfo::canShrink(const MachineInstr &MI,
+ const MachineRegisterInfo &MRI) const {
+ const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
+ // Decide whether \p MI may be shrunk to a 32-bit encoding. Instructions with
+ // a src2 operand are rejected, except for the opcodes in the switch below.
+ // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add a
+ // special case for it. It can only be shrunk if the third operand is vcc. We
+ // should handle this the same way we handle vopc, by adding a register
+ // allocation hint pre-regalloc and then do the shrinking post-regalloc.
+ if (Src2) {
+ switch (MI.getOpcode()) {
+ default: return false; // any other opcode with a src2 operand
+
+ case AMDGPU::V_ADDC_U32_e64:
+ case AMDGPU::V_SUBB_U32_e64:
+ case AMDGPU::V_SUBBREV_U32_e64: {
+ const MachineOperand *Src1
+ = getNamedOperand(MI, AMDGPU::OpName::src1);
+ if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
+ return false;
+ // Additional verification is needed for sdst/src2 (not done here).
+ return true; // note: the modifier/encoding checks below are skipped
+ }
+ case AMDGPU::V_MAC_F32_e64:
+ case AMDGPU::V_MAC_F16_e64:
+ case AMDGPU::V_FMAC_F32_e64:
+ if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
+ hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
+ return false;
+ break; // continue with the common checks after the switch
+
+ case AMDGPU::V_CNDMASK_B32_e64:
+ break; // no src2 check here; continue with the common checks
+ }
+ }
+
+ const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
+ if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
+ hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
+ return false; // src1, when present, must be an unmodified VGPR
+
+ // We don't need to check src0, all input types are legal, so just make sure
+ // src0 isn't using any modifiers.
+ if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
+ return false;
+
+ // Can it be shrunk to a valid 32 bit opcode?
+ if (!hasVALU32BitEncoding(MI.getOpcode()))
+ return false;
+
+ // Check output modifiers: neither omod nor clamp may be set.
+ return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
+ !hasModifiersSet(MI, AMDGPU::OpName::clamp);
+}
+
+// Copy the undef and kill flags of \p Orig onto the first implicit use of VCC
+// in \p MI; the operand stays implicit. No-op if \p MI has no such operand.
+static void copyFlagsToImplicitVCC(MachineInstr &MI,
+ const MachineOperand &Orig) {
+
+ for (MachineOperand &Use : MI.implicit_operands()) {
+ if (Use.isUse() && Use.getReg() == AMDGPU::VCC) {
+ Use.setIsUndef(Orig.isUndef());
+ Use.setIsKill(Orig.isKill());
+ return; // only the first matching operand is updated
+ }
+ }
+}
+
+MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
+ unsigned Op32) const {
+ MachineBasicBlock *MBB = MI.getParent();;
+ MachineInstrBuilder Inst32 =
+ BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32)); // new Op32 instruction at MI's position, same debug loc
+
+ // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
+ // For VOPC instructions, this is replaced by an implicit def of vcc.
+ int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst);
+ if (Op32DstIdx != -1) {
+ // dst: reuse operand 0 of the original instruction.
+ Inst32.add(MI.getOperand(0));
+ } else {
+ assert(MI.getOperand(0).getReg() == AMDGPU::VCC &&
+ "Unexpected case");
+ }
+
+ Inst32.add(*getNamedOperand(MI, AMDGPU::OpName::src0)); // src0 is dereferenced unchecked, so it must exist
+
+ const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
+ if (Src1)
+ Inst32.add(*Src1);
+
+ const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
+
+ if (Src2) {
+ int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
+ if (Op32Src2Idx != -1) {
+ Inst32.add(*Src2); // the 32-bit opcode keeps an explicit src2
+ } else {
+ // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
+ // replaced with an implicit read of vcc. This was already added
+ // during the initial BuildMI, so find it to preserve the flags.
+ copyFlagsToImplicitVCC(*Inst32, *Src2);
+ }
+ }
+
+ return Inst32; // \p MI itself is not erased by this function
+}
+
bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
const MachineOperand &MO,
const MCOperandInfo &OpInfo) const {
@@ -2806,6 +2972,42 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
+ // Verify MIMG
+ if (isMIMG(MI.getOpcode()) && !MI.mayStore()) {
+ // Ensure that the return type used is large enough for all the options
+ // being used TFE/LWE require an extra result register.
+ const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
+ if (DMask) {
+ uint64_t DMaskImm = DMask->getImm();
+ uint32_t RegCount =
+ isGather4(MI.getOpcode()) ? 4 : countPopulation(DMaskImm);
+ const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
+ const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
+ const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
+
+ // Adjust for packed 16 bit values
+ if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
+ RegCount >>= 1;
+
+ // Adjust if using LWE or TFE
+ if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
+ RegCount += 1;
+
+ const uint32_t DstIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
+ const MachineOperand &Dst = MI.getOperand(DstIdx);
+ if (Dst.isReg()) {
+ const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
+ uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
+ if (RegCount > DstSize) {
+ ErrInfo = "MIMG instruction returns too many registers for dst "
+ "register class";
+ return false;
+ }
+ }
+ }
+ }
+
// Verify VOP*. Ignore multiple sgpr operands on writelane.
if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32
&& (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) {
@@ -3001,6 +3203,8 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
+ case AMDGPU::S_XNOR_B32:
+ return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
@@ -3438,8 +3642,13 @@ void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
// pointer value is uniform.
MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
- unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
- SBase->setReg(SGPR);
+ unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
+ SBase->setReg(SGPR);
+ }
+ MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soff);
+ if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) {
+ unsigned SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
+ SOff->setReg(SGPR);
}
}
@@ -3475,7 +3684,191 @@ void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
FoldImmediate(*Copy, *Def, OpReg, &MRI);
}
-void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
+// Emit the actual waterfall loop, executing the wrapped instruction for each
+// unique value of \p Rsrc across all lanes. In the best case we execute 1
+// iteration, in the worst case we execute 64 (once per lane). \p OrigBB is currently unused.
+static void
+emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
+ MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
+ const DebugLoc &DL, MachineOperand &Rsrc) {
+ MachineBasicBlock::iterator I = LoopBB.begin();
+
+ unsigned VRsrc = Rsrc.getReg();
+ unsigned VRsrcUndef = getUndefRegState(Rsrc.isUndef());
+
+ unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned CondReg0 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned CondReg1 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned AndCond = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned SRsrcSub0 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ unsigned SRsrcSub1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ unsigned SRsrcSub2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ unsigned SRsrcSub3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
+
+ // Beginning of the loop, read the next Rsrc variant: one V_READFIRSTLANE_B32 per 32-bit subregister.
+ BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub0)
+ .addReg(VRsrc, VRsrcUndef, AMDGPU::sub0);
+ BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub1)
+ .addReg(VRsrc, VRsrcUndef, AMDGPU::sub1);
+ BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub2)
+ .addReg(VRsrc, VRsrcUndef, AMDGPU::sub2);
+ BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub3)
+ .addReg(VRsrc, VRsrcUndef, AMDGPU::sub3);
+
+ BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SRsrc) // reassemble the four SGPR parts into one 128-bit SGPR value
+ .addReg(SRsrcSub0)
+ .addImm(AMDGPU::sub0)
+ .addReg(SRsrcSub1)
+ .addImm(AMDGPU::sub1)
+ .addReg(SRsrcSub2)
+ .addImm(AMDGPU::sub2)
+ .addReg(SRsrcSub3)
+ .addImm(AMDGPU::sub3);
+
+ // Update Rsrc operand to use the SGPR Rsrc, and mark that use as a kill.
+ Rsrc.setReg(SRsrc);
+ Rsrc.setIsKill(true);
+
+ // Identify all lanes with identical Rsrc operands in their VGPRs by comparing both 64-bit halves.
+ BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg0)
+ .addReg(SRsrc, 0, AMDGPU::sub0_sub1)
+ .addReg(VRsrc, 0, AMDGPU::sub0_sub1);
+ BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg1)
+ .addReg(SRsrc, 0, AMDGPU::sub2_sub3)
+ .addReg(VRsrc, 0, AMDGPU::sub2_sub3);
+ BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_AND_B64), AndCond)
+ .addReg(CondReg0)
+ .addReg(CondReg1);
+
+ MRI.setSimpleHint(SaveExec, AndCond);
+
+ // Update EXEC to matching lanes, saving original to SaveExec.
+ BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_AND_SAVEEXEC_B64), SaveExec)
+ .addReg(AndCond, RegState::Kill);
+
+ // The original instruction is here; we insert the terminators after it (at the end of LoopBB).
+ I = LoopBB.end();
+
+ // Update EXEC, switch all done bits to 0 and all todo bits to 1.
+ BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_XOR_B64_term), AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC)
+ .addReg(SaveExec);
+ BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(&LoopBB); // branch back to LoopBB while EXEC is non-zero
+}
+
+// Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register
+// with SGPRs by iterating over all unique values across all lanes. \p MDT may be null; if given, it is updated.
+static void loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
+ MachineOperand &Rsrc, MachineDominatorTree *MDT) {
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ MachineBasicBlock::iterator I(&MI);
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+
+ // Save the EXEC mask before \p MI, while it is still in the original block.
+ BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B64), SaveExec)
+ .addReg(AMDGPU::EXEC);
+
+ // Killed uses in the instruction we are waterfalling around will be
+ // incorrect due to the added control-flow, so clear kill flags on every register it uses.
+ for (auto &MO : MI.uses()) {
+ if (MO.isReg() && MO.isUse()) {
+ MRI.clearKillFlags(MO.getReg());
+ }
+ }
+
+ // To insert the loop we need to split the block. Move everything after this
+ // point to a new block, and insert a new empty block between the two.
+ MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
+ MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
+ MachineFunction::iterator MBBI(MBB);
+ ++MBBI;
+
+ MF.insert(MBBI, LoopBB); // resulting layout: MBB, LoopBB, RemainderBB
+ MF.insert(MBBI, RemainderBB);
+
+ LoopBB->addSuccessor(LoopBB);
+ LoopBB->addSuccessor(RemainderBB);
+
+ // Move MI to the LoopBB, and the remainder of the block to RemainderBB.
+ MachineBasicBlock::iterator J = I++; // J refers to MI, I to the instruction after it
+ RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
+ RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
+ LoopBB->splice(LoopBB->begin(), &MBB, J);
+
+ MBB.addSuccessor(LoopBB);
+
+ // Update dominators. We know that MBB immediately dominates LoopBB, that
+ // LoopBB immediately dominates RemainderBB, and that RemainderBB immediately
+ // dominates all of the successors transferred to it from MBB that MBB used
+ // to dominate.
+ if (MDT) {
+ MDT->addNewBlock(LoopBB, &MBB);
+ MDT->addNewBlock(RemainderBB, LoopBB);
+ for (auto &Succ : RemainderBB->successors()) {
+ if (MDT->dominates(&MBB, Succ)) {
+ MDT->changeImmediateDominator(Succ, RemainderBB);
+ }
+ }
+ }
+
+ emitLoadSRsrcFromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, Rsrc);
+
+ // Restore the EXEC mask at the top of RemainderBB.
+ MachineBasicBlock::iterator First = RemainderBB->begin();
+ BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+ .addReg(SaveExec);
+}
+
+// Extract pointer from Rsrc and return a zero-value Rsrc replacement, as the tuple {RsrcPtr, NewSRsrc}.
+static std::tuple<unsigned, unsigned>
+extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ // Extract the ptr (subregister sub0_sub1) from the resource descriptor.
+ unsigned RsrcPtr =
+ TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
+ AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
+
+ // Create an empty resource descriptor: zero in sub0_sub1, default data format in sub2/sub3.
+ unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
+ uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
+
+ // Zero64 = 0
+ BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
+ .addImm(0);
+
+ // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
+ BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
+ .addImm(RsrcDataFormat & 0xFFFFFFFF);
+
+ // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
+ BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
+ .addImm(RsrcDataFormat >> 32);
+
+ // NewSRsrc = {Zero64, SRsrcFormatLo, SRsrcFormatHi}
+ BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
+ .addReg(Zero64)
+ .addImm(AMDGPU::sub0_sub1)
+ .addReg(SRsrcFormatLo)
+ .addImm(AMDGPU::sub2)
+ .addReg(SRsrcFormatHi)
+ .addImm(AMDGPU::sub3);
+
+ return std::make_tuple(RsrcPtr, NewSRsrc);
+}
+
+void SIInstrInfo::legalizeOperands(MachineInstr &MI,
+ MachineDominatorTree *MDT) const {
MachineFunction &MF = *MI.getParent()->getParent();
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -3617,75 +4010,56 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
return;
}
- // Legalize MUBUF* instructions by converting to addr64 form.
- // FIXME: If we start using the non-addr64 instructions for compute, we
- // may need to legalize them as above. This especially applies to the
- // buffer_load_format_* variants and variants with idxen (or bothen).
- int SRsrcIdx =
+ // Legalize MUBUF* instructions.
+ int RsrcIdx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
- if (SRsrcIdx != -1) {
+ if (RsrcIdx != -1) {
// We have an MUBUF instruction
- MachineOperand *SRsrc = &MI.getOperand(SRsrcIdx);
- unsigned SRsrcRC = get(MI.getOpcode()).OpInfo[SRsrcIdx].RegClass;
- if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()),
- RI.getRegClass(SRsrcRC))) {
+ MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
+ unsigned RsrcRC = get(MI.getOpcode()).OpInfo[RsrcIdx].RegClass;
+ if (RI.getCommonSubClass(MRI.getRegClass(Rsrc->getReg()),
+ RI.getRegClass(RsrcRC))) {
// The operands are legal.
// FIXME: We may need to legalize operands besided srsrc.
return;
}
- MachineBasicBlock &MBB = *MI.getParent();
-
- // Extract the ptr from the resource descriptor.
- unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc,
- &AMDGPU::VReg_128RegClass, AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
-
- // Create an empty resource descriptor
- unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
- unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
- uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();
-
- // Zero64 = 0
- BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B64), Zero64)
- .addImm(0);
+ // Legalize a VGPR Rsrc.
+ //
+ // If the instruction is _ADDR64, we can avoid a waterfall by extracting
+ // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
+ // a zero-value SRsrc.
+ //
+ // If the instruction is _OFFSET (both idxen and offen disabled), and we
+ // support ADDR64 instructions, we can convert to ADDR64 and do the same as
+ // above.
+ //
+ // Otherwise we are on non-ADDR64 hardware, and/or we have
+ // idxen/offen/bothen and we fall back to a waterfall loop.
- // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
- BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
- .addImm(RsrcDataFormat & 0xFFFFFFFF);
-
- // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
- BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
- .addImm(RsrcDataFormat >> 32);
-
- // NewSRsrc = {Zero64, SRsrcFormat}
- BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc)
- .addReg(Zero64)
- .addImm(AMDGPU::sub0_sub1)
- .addReg(SRsrcFormatLo)
- .addImm(AMDGPU::sub2)
- .addReg(SRsrcFormatHi)
- .addImm(AMDGPU::sub3);
+ MachineBasicBlock &MBB = *MI.getParent();
MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
- unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
- if (VAddr) {
+ if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
// This is already an ADDR64 instruction so we need to add the pointer
// extracted from the resource descriptor to the current value of VAddr.
unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+
+ unsigned RsrcPtr, NewSRsrc;
+ std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
- // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0
+ // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
DebugLoc DL = MI.getDebugLoc();
BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
- .addReg(SRsrcPtr, 0, AMDGPU::sub0)
- .addReg(VAddr->getReg(), 0, AMDGPU::sub0);
+ .addReg(RsrcPtr, 0, AMDGPU::sub0)
+ .addReg(VAddr->getReg(), 0, AMDGPU::sub0);
- // NewVaddrHi = SRsrcPtr:sub1 + VAddr:sub1
+ // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi)
- .addReg(SRsrcPtr, 0, AMDGPU::sub1)
- .addReg(VAddr->getReg(), 0, AMDGPU::sub1);
+ .addReg(RsrcPtr, 0, AMDGPU::sub1)
+ .addReg(VAddr->getReg(), 0, AMDGPU::sub1);
// NewVaddr = {NewVaddrHi, NewVaddrLo}
BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
@@ -3693,13 +4067,20 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
.addImm(AMDGPU::sub0)
.addReg(NewVAddrHi)
.addImm(AMDGPU::sub1);
- } else {
+
+ VAddr->setReg(NewVAddr);
+ Rsrc->setReg(NewSRsrc);
+ } else if (!VAddr && ST.hasAddr64()) {
// This instructions is the _OFFSET variant, so we need to convert it to
// ADDR64.
assert(MBB.getParent()->getSubtarget<GCNSubtarget>().getGeneration()
< AMDGPUSubtarget::VOLCANIC_ISLANDS &&
"FIXME: Need to emit flat atomics here");
+ unsigned RsrcPtr, NewSRsrc;
+ std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
+
+ unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
@@ -3715,10 +4096,8 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
MachineInstrBuilder MIB =
BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
.add(*VData)
- .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
- // This will be replaced later
- // with the new value of vaddr.
- .add(*SRsrc)
+ .addReg(NewVAddr)
+ .addReg(NewSRsrc)
.add(*SOffset)
.add(*Offset);
@@ -3735,21 +4114,19 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
MIB.addImm(TFE->getImm());
}
- MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+ MIB.cloneMemRefs(MI);
Addr64 = MIB;
} else {
// Atomics with return.
Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
.add(*VData)
.add(*VDataIn)
- .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
- // This will be replaced later
- // with the new value of vaddr.
- .add(*SRsrc)
+ .addReg(NewVAddr)
+ .addReg(NewSRsrc)
.add(*SOffset)
.add(*Offset)
.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc))
- .setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+ .cloneMemRefs(MI);
}
MI.removeFromParent();
@@ -3757,23 +4134,20 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
// NewVaddr = {NewVaddrHi, NewVaddrLo}
BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
NewVAddr)
- .addReg(SRsrcPtr, 0, AMDGPU::sub0)
+ .addReg(RsrcPtr, 0, AMDGPU::sub0)
.addImm(AMDGPU::sub0)
- .addReg(SRsrcPtr, 0, AMDGPU::sub1)
+ .addReg(RsrcPtr, 0, AMDGPU::sub1)
.addImm(AMDGPU::sub1);
-
- VAddr = getNamedOperand(*Addr64, AMDGPU::OpName::vaddr);
- SRsrc = getNamedOperand(*Addr64, AMDGPU::OpName::srsrc);
+ } else {
+ // This is another variant; legalize Rsrc with waterfall loop from VGPRs
+ // to SGPRs.
+ loadSRsrcFromVGPR(*this, MI, *Rsrc, MDT);
}
-
- // Update the instruction to use NewVaddr
- VAddr->setReg(NewVAddr);
- // Update the instruction to use NewSRsrc
- SRsrc->setReg(NewSRsrc);
}
}
-void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
+void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
+ MachineDominatorTree *MDT) const {
SetVectorType Worklist;
Worklist.insert(&TopInst);
@@ -3791,34 +4165,62 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
break;
case AMDGPU::S_ADD_U64_PSEUDO:
case AMDGPU::S_SUB_U64_PSEUDO:
- splitScalar64BitAddSub(Worklist, Inst);
+ splitScalar64BitAddSub(Worklist, Inst, MDT);
Inst.eraseFromParent();
continue;
case AMDGPU::S_ADD_I32:
case AMDGPU::S_SUB_I32:
// FIXME: The u32 versions currently selected use the carry.
- if (moveScalarAddSub(Worklist, Inst))
+ if (moveScalarAddSub(Worklist, Inst, MDT))
continue;
// Default handling
break;
case AMDGPU::S_AND_B64:
- splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64);
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
Inst.eraseFromParent();
continue;
case AMDGPU::S_OR_B64:
- splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64);
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
Inst.eraseFromParent();
continue;
case AMDGPU::S_XOR_B64:
- splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64);
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
+ Inst.eraseFromParent();
+ continue;
+
+ case AMDGPU::S_NAND_B64:
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
+ Inst.eraseFromParent();
+ continue;
+
+ case AMDGPU::S_NOR_B64:
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
+ Inst.eraseFromParent();
+ continue;
+
+ case AMDGPU::S_XNOR_B64:
+ if (ST.hasDLInsts())
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
+ else
+ splitScalar64BitXnor(Worklist, Inst, MDT);
+ Inst.eraseFromParent();
+ continue;
+
+ case AMDGPU::S_ANDN2_B64:
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
+ Inst.eraseFromParent();
+ continue;
+
+ case AMDGPU::S_ORN2_B64:
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
Inst.eraseFromParent();
continue;
case AMDGPU::S_NOT_B64:
- splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32);
+ splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
Inst.eraseFromParent();
continue;
@@ -3899,90 +4301,31 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
Inst.eraseFromParent();
continue;
- case AMDGPU::S_XNOR_B64:
- splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32);
+ case AMDGPU::S_NAND_B32:
+ splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
Inst.eraseFromParent();
continue;
- case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR: {
- unsigned VDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- const MachineOperand *VAddr = getNamedOperand(Inst, AMDGPU::OpName::soff);
- auto Add = MRI.getUniqueVRegDef(VAddr->getReg());
- unsigned Offset = 0;
-
- // FIXME: This isn't safe because the addressing mode doesn't work
- // correctly if vaddr is negative.
- //
- // FIXME: Should probably be done somewhere else, maybe SIFoldOperands.
- //
- // See if we can extract an immediate offset by recognizing one of these:
- // V_ADD_I32_e32 dst, imm, src1
- // V_ADD_I32_e32 dst, (S_MOV_B32 imm), src1
- // V_ADD will be removed by "Remove dead machine instructions".
- if (Add &&
- (Add->getOpcode() == AMDGPU::V_ADD_I32_e32 ||
- Add->getOpcode() == AMDGPU::V_ADD_U32_e64)) {
- static const unsigned SrcNames[2] = {
- AMDGPU::OpName::src0,
- AMDGPU::OpName::src1,
- };
-
- // Find a literal offset in one of source operands.
- for (int i = 0; i < 2; i++) {
- const MachineOperand *Src =
- getNamedOperand(*Add, SrcNames[i]);
-
- if (Src->isReg()) {
- auto Mov = MRI.getUniqueVRegDef(Src->getReg());
- if (Mov && Mov->getOpcode() == AMDGPU::S_MOV_B32)
- Src = &Mov->getOperand(1);
- }
-
- if (Src) {
- if (Src->isImm())
- Offset = Src->getImm();
- else if (Src->isCImm())
- Offset = Src->getCImm()->getZExtValue();
- }
-
- if (Offset && isLegalMUBUFImmOffset(Offset)) {
- VAddr = getNamedOperand(*Add, SrcNames[!i]);
- break;
- }
-
- Offset = 0;
- }
- }
-
- MachineInstr *NewInstr =
- BuildMI(*MBB, Inst, Inst.getDebugLoc(),
- get(AMDGPU::BUFFER_LOAD_DWORD_OFFEN), VDst)
- .add(*VAddr) // vaddr
- .add(*getNamedOperand(Inst, AMDGPU::OpName::sbase)) // srsrc
- .addImm(0) // soffset
- .addImm(Offset) // offset
- .addImm(getNamedOperand(Inst, AMDGPU::OpName::glc)->getImm())
- .addImm(0) // slc
- .addImm(0) // tfe
- .setMemRefs(Inst.memoperands_begin(), Inst.memoperands_end())
- .getInstr();
+ case AMDGPU::S_NOR_B32:
+ splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
+ Inst.eraseFromParent();
+ continue;
- MRI.replaceRegWith(getNamedOperand(Inst, AMDGPU::OpName::sdst)->getReg(),
- VDst);
- addUsersToMoveToVALUWorklist(VDst, MRI, Worklist);
+ case AMDGPU::S_ANDN2_B32:
+ splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
Inst.eraseFromParent();
+ continue;
- // Legalize all operands other than the offset. Notably, convert the srsrc
- // into SGPRs using v_readfirstlane if needed.
- legalizeOperands(*NewInstr);
+ case AMDGPU::S_ORN2_B32:
+ splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
+ Inst.eraseFromParent();
continue;
}
- }
if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
// We cannot move this instruction to the VALU, so we should try to
// legalize its operands instead.
- legalizeOperands(Inst);
+ legalizeOperands(Inst, MDT);
continue;
}
@@ -4071,7 +4414,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
}
// Legalize the operands
- legalizeOperands(Inst);
+ legalizeOperands(Inst, MDT);
if (HasDst)
addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
@@ -4079,8 +4422,8 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
}
// Add/sub require special handling to deal with carry outs.
-bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist,
- MachineInstr &Inst) const {
+bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
+ MachineDominatorTree *MDT) const {
if (ST.hasAddNoCarry()) {
// Assume there is no user of scc since we don't select this in that case.
// Since scc isn't used, it doesn't really matter if the i32 or u32 variant
@@ -4104,7 +4447,7 @@ bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist,
Inst.setDesc(get(NewOpc));
Inst.addImplicitDefUseOperands(*MBB.getParent());
MRI.replaceRegWith(OldDstReg, ResultReg);
- legalizeOperands(Inst);
+ legalizeOperands(Inst, MDT);
addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
return true;
@@ -4151,23 +4494,116 @@ void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
MachineOperand &Src0 = Inst.getOperand(1);
MachineOperand &Src1 = Inst.getOperand(2);
- legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
- legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
-
- unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
if (ST.hasDLInsts()) {
+ unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
+ legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
+
BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
.add(Src0)
.add(Src1);
+
+ MRI.replaceRegWith(Dest.getReg(), NewDest);
+ addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
} else {
- unsigned Xor = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- BuildMI(MBB, MII, DL, get(AMDGPU::V_XOR_B32_e64), Xor)
- .add(Src0)
+ // Using the bitwise identity ~(x ^ y) == (~x ^ y) == (x ^ ~y), we can
+ // invert either source and then perform the XOR. If either source is a
+ // scalar register, then we can leave the inversion on the scalar unit to
+ // achieve a better distribution of scalar and vector instructions.
+ bool Src0IsSGPR = Src0.isReg() &&
+ RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
+ bool Src1IsSGPR = Src1.isReg() &&
+ RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
+ MachineInstr *Not = nullptr;
+ MachineInstr *Xor = nullptr;
+ unsigned Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+ // Build a pair of scalar instructions and add them to the work list.
+ // The next iteration over the work list will lower these to the vector
+ // unit as necessary.
+ if (Src0IsSGPR) {
+ Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp)
+ .add(Src0);
+ Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
+ .addReg(Temp)
.add(Src1);
+ } else if (Src1IsSGPR) {
+ Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp)
+ .add(Src1);
+ Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
+ .add(Src0)
+ .addReg(Temp);
+ } else {
+ Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
+ .add(Src0)
+ .add(Src1);
+ Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
+ .addReg(Temp);
+ Worklist.insert(Not);
+ }
+
+ MRI.replaceRegWith(Dest.getReg(), NewDest);
- BuildMI(MBB, MII, DL, get(AMDGPU::V_NOT_B32_e64), NewDest)
- .addReg(Xor);
+ Worklist.insert(Xor);
+
+ addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
}
+}
+
+void SIInstrInfo::splitScalarNotBinop(SetVectorType &Worklist,
+ MachineInstr &Inst,
+ unsigned Opcode) const {
+ MachineBasicBlock &MBB = *Inst.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ MachineBasicBlock::iterator MII = Inst;
+ const DebugLoc &DL = Inst.getDebugLoc();
+
+ MachineOperand &Dest = Inst.getOperand(0);
+ MachineOperand &Src0 = Inst.getOperand(1);
+ MachineOperand &Src1 = Inst.getOperand(2);
+
+ unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+ MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
+ .add(Src0)
+ .add(Src1);
+
+ MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
+ .addReg(Interm);
+
+ Worklist.insert(&Op);
+ Worklist.insert(&Not);
+
+ MRI.replaceRegWith(Dest.getReg(), NewDest);
+ addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
+}
+
+void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist,
+ MachineInstr &Inst,
+ unsigned Opcode) const {
+ MachineBasicBlock &MBB = *Inst.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ MachineBasicBlock::iterator MII = Inst;
+ const DebugLoc &DL = Inst.getDebugLoc();
+
+ MachineOperand &Dest = Inst.getOperand(0);
+ MachineOperand &Src0 = Inst.getOperand(1);
+ MachineOperand &Src1 = Inst.getOperand(2);
+
+ unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+ MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
+ .add(Src1);
+
+ MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
+ .add(Src0)
+ .addReg(Interm);
+
+ Worklist.insert(&Not);
+ Worklist.insert(&Op);
MRI.replaceRegWith(Dest.getReg(), NewDest);
addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
@@ -4200,13 +4636,13 @@ void SIInstrInfo::splitScalar64BitUnaryOp(
const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
- BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
+ MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
AMDGPU::sub1, Src0SubRC);
unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
- BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
+ MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
@@ -4217,6 +4653,9 @@ void SIInstrInfo::splitScalar64BitUnaryOp(
MRI.replaceRegWith(Dest.getReg(), FullDestReg);
+ Worklist.insert(&LoHalf);
+ Worklist.insert(&HiHalf);
+
// We don't need to legalizeOperands here because for a single operand, src0
// will support any kind of input.
@@ -4224,8 +4663,9 @@ void SIInstrInfo::splitScalar64BitUnaryOp(
addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}
-void SIInstrInfo::splitScalar64BitAddSub(
- SetVectorType &Worklist, MachineInstr &Inst) const {
+void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist,
+ MachineInstr &Inst,
+ MachineDominatorTree *MDT) const {
bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
MachineBasicBlock &MBB = *Inst.getParent();
@@ -4285,16 +4725,16 @@ void SIInstrInfo::splitScalar64BitAddSub(
// Try to legalize the operands in case we need to swap the order to keep it
// valid.
- legalizeOperands(*LoHalf);
- legalizeOperands(*HiHalf);
+ legalizeOperands(*LoHalf, MDT);
+ legalizeOperands(*HiHalf, MDT);
// Move all users of this moved vlaue.
addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}
-void SIInstrInfo::splitScalar64BitBinaryOp(
- SetVectorType &Worklist, MachineInstr &Inst,
- unsigned Opcode) const {
+void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist,
+ MachineInstr &Inst, unsigned Opcode,
+ MachineDominatorTree *MDT) const {
MachineBasicBlock &MBB = *Inst.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -4321,6 +4761,10 @@ void SIInstrInfo::splitScalar64BitBinaryOp(
AMDGPU::sub0, Src0SubRC);
MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
AMDGPU::sub0, Src1SubRC);
+ MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
+ AMDGPU::sub1, Src0SubRC);
+ MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
+ AMDGPU::sub1, Src1SubRC);
const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
@@ -4331,11 +4775,6 @@ void SIInstrInfo::splitScalar64BitBinaryOp(
.add(SrcReg0Sub0)
.add(SrcReg1Sub0);
- MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
- AMDGPU::sub1, Src0SubRC);
- MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
- AMDGPU::sub1, Src1SubRC);
-
unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
.add(SrcReg0Sub1)
@@ -4350,22 +4789,62 @@ void SIInstrInfo::splitScalar64BitBinaryOp(
MRI.replaceRegWith(Dest.getReg(), FullDestReg);
- // Try to legalize the operands in case we need to swap the order to keep it
- // valid.
- legalizeOperands(LoHalf);
- legalizeOperands(HiHalf);
+ Worklist.insert(&LoHalf);
+ Worklist.insert(&HiHalf);
// Move all users of this moved vlaue.
addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}
+void SIInstrInfo::splitScalar64BitXnor(SetVectorType &Worklist,
+ MachineInstr &Inst,
+ MachineDominatorTree *MDT) const {
+ MachineBasicBlock &MBB = *Inst.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+ MachineOperand &Dest = Inst.getOperand(0);
+ MachineOperand &Src0 = Inst.getOperand(1);
+ MachineOperand &Src1 = Inst.getOperand(2);
+ const DebugLoc &DL = Inst.getDebugLoc();
+
+ MachineBasicBlock::iterator MII = Inst;
+
+ const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
+
+ unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+
+ MachineOperand* Op0;
+ MachineOperand* Op1;
+
+ if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
+ Op0 = &Src0;
+ Op1 = &Src1;
+ } else {
+ Op0 = &Src1;
+ Op1 = &Src0;
+ }
+
+ BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
+ .add(*Op0);
+
+ unsigned NewDest = MRI.createVirtualRegister(DestRC);
+
+ MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
+ .addReg(Interm)
+ .add(*Op1);
+
+ MRI.replaceRegWith(Dest.getReg(), NewDest);
+
+ Worklist.insert(&Xor);
+}
+
void SIInstrInfo::splitScalar64BitBCNT(
SetVectorType &Worklist, MachineInstr &Inst) const {
MachineBasicBlock &MBB = *Inst.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
MachineBasicBlock::iterator MII = Inst;
- DebugLoc DL = Inst.getDebugLoc();
+ const DebugLoc &DL = Inst.getDebugLoc();
MachineOperand &Dest = Inst.getOperand(0);
MachineOperand &Src = Inst.getOperand(1);
@@ -4401,7 +4880,7 @@ void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
MachineBasicBlock &MBB = *Inst.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
MachineBasicBlock::iterator MII = Inst;
- DebugLoc DL = Inst.getDebugLoc();
+ const DebugLoc &DL = Inst.getDebugLoc();
MachineOperand &Dest = Inst.getOperand(0);
uint32_t Imm = Inst.getOperand(2).getImm();
@@ -4546,10 +5025,10 @@ void SIInstrInfo::addSCCDefUsersToVALUWorklist(
make_range(MachineBasicBlock::iterator(SCCDefInst),
SCCDefInst.getParent()->end())) {
// Exit if we find another SCC def.
- if (MI.findRegisterDefOperandIdx(AMDGPU::SCC) != -1)
+ if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != -1)
return;
- if (MI.findRegisterUseOperandIdx(AMDGPU::SCC) != -1)
+ if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI) != -1)
Worklist.insert(&MI);
}
}
@@ -4716,7 +5195,7 @@ unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
return AMDGPU::NoRegister;
assert(!MI.memoperands_empty() &&
- (*MI.memoperands_begin())->getAddrSpace() == ST.getAMDGPUAS().PRIVATE_ADDRESS);
+ (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
FrameIndex = Addr->getIndex();
return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
@@ -4777,12 +5256,6 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
// If we have a definitive size, we can use it. Otherwise we need to inspect
// the operands to know the size.
- //
- // FIXME: Instructions that have a base 32-bit encoding report their size as
- // 4, even though they are really 8 bytes if they have a literal operand.
- if (DescSize != 0 && DescSize != 4)
- return DescSize;
-
if (isFixedSize(MI))
return DescSize;
@@ -4791,23 +5264,27 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
if (isVALU(MI) || isSALU(MI)) {
int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
if (Src0Idx == -1)
- return 4; // No operands.
+ return DescSize; // No operands.
if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx]))
- return 8;
+ return DescSize + 4;
int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
if (Src1Idx == -1)
- return 4;
+ return DescSize;
if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx]))
- return 8;
+ return DescSize + 4;
- return 4;
- }
+ int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
+ if (Src2Idx == -1)
+ return DescSize;
- if (DescSize == 4)
- return 4;
+ if (isLiteralConstantLike(MI.getOperand(Src2Idx), Desc.OpInfo[Src2Idx]))
+ return DescSize + 4;
+
+ return DescSize;
+ }
switch (Opc) {
case TargetOpcode::IMPLICIT_DEF:
@@ -4823,7 +5300,7 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
}
default:
- llvm_unreachable("unable to find instruction size");
+ return DescSize;
}
}
@@ -4835,7 +5312,7 @@ bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
return true;
for (const MachineMemOperand *MMO : MI.memoperands()) {
- if (MMO->getAddrSpace() == ST.getAMDGPUAS().FLAT_ADDRESS)
+ if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
return true;
}
return false;
@@ -5069,3 +5546,84 @@ int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
return MCOp;
}
+
+static
+TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) {
+ assert(RegOpnd.isReg());
+ return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
+ getRegSubRegPair(RegOpnd);
+}
+
+TargetInstrInfo::RegSubRegPair
+llvm::getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg) {
+ assert(MI.isRegSequence());
+ for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
+ if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
+ auto &RegOp = MI.getOperand(1 + 2 * I);
+ return getRegOrUndef(RegOp);
+ }
+ return TargetInstrInfo::RegSubRegPair();
+}
+
+// Try to find the definition of reg:subreg in subreg-manipulation pseudos.
+// Following a subreg of reg:subreg isn't supported.
+static bool followSubRegDef(MachineInstr &MI,
+ TargetInstrInfo::RegSubRegPair &RSR) {
+ if (!RSR.SubReg)
+ return false;
+ switch (MI.getOpcode()) {
+ default: break;
+ case AMDGPU::REG_SEQUENCE:
+ RSR = getRegSequenceSubReg(MI, RSR.SubReg);
+ return true;
+ // EXTRACT_SUBREG isn't supported as this would follow a subreg of subreg
+ case AMDGPU::INSERT_SUBREG:
+ if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
+ // inserted the subreg we're looking for
+ RSR = getRegOrUndef(MI.getOperand(2));
+ else { // the subreg in the rest of the reg
+ auto R1 = getRegOrUndef(MI.getOperand(1));
+ if (R1.SubReg) // subreg of subreg isn't supported
+ return false;
+ RSR.Reg = R1.Reg;
+ }
+ return true;
+ }
+ return false;
+}
+
+MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
+ MachineRegisterInfo &MRI) {
+ assert(MRI.isSSA());
+ if (!TargetRegisterInfo::isVirtualRegister(P.Reg))
+ return nullptr;
+
+ auto RSR = P;
+ auto *DefInst = MRI.getVRegDef(RSR.Reg);
+ while (auto *MI = DefInst) {
+ DefInst = nullptr;
+ switch (MI->getOpcode()) {
+ case AMDGPU::COPY:
+ case AMDGPU::V_MOV_B32_e32: {
+ auto &Op1 = MI->getOperand(1);
+ if (Op1.isReg() &&
+ TargetRegisterInfo::isVirtualRegister(Op1.getReg())) {
+ if (Op1.isUndef())
+ return nullptr;
+ RSR = getRegSubRegPair(Op1);
+ DefInst = MRI.getVRegDef(RSR.Reg);
+ }
+ break;
+ }
+ default:
+ if (followSubRegDef(*MI, RSR)) {
+ if (!RSR.Reg)
+ return nullptr;
+ DefInst = MRI.getVRegDef(RSR.Reg);
+ }
+ }
+ if (!DefInst)
+ return MI;
+ }
+ return nullptr;
+}
diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h
index d681b926504ed..5b1a05f3785ec 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/lib/Target/AMDGPU/SIInstrInfo.h
@@ -37,6 +37,7 @@
namespace llvm {
class APInt;
+class MachineDominatorTree;
class MachineRegisterInfo;
class RegScavenger;
class GCNSubtarget;
@@ -79,8 +80,8 @@ public:
private:
void swapOperands(MachineInstr &Inst) const;
- bool moveScalarAddSub(SetVectorType &Worklist,
- MachineInstr &Inst) const;
+ bool moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
+ MachineDominatorTree *MDT = nullptr) const;
void lowerScalarAbs(SetVectorType &Worklist,
MachineInstr &Inst) const;
@@ -88,14 +89,26 @@ private:
void lowerScalarXnor(SetVectorType &Worklist,
MachineInstr &Inst) const;
+ void splitScalarNotBinop(SetVectorType &Worklist,
+ MachineInstr &Inst,
+ unsigned Opcode) const;
+
+ void splitScalarBinOpN2(SetVectorType &Worklist,
+ MachineInstr &Inst,
+ unsigned Opcode) const;
+
void splitScalar64BitUnaryOp(SetVectorType &Worklist,
MachineInstr &Inst, unsigned Opcode) const;
- void splitScalar64BitAddSub(SetVectorType &Worklist,
- MachineInstr &Inst) const;
+ void splitScalar64BitAddSub(SetVectorType &Worklist, MachineInstr &Inst,
+ MachineDominatorTree *MDT = nullptr) const;
+
+ void splitScalar64BitBinaryOp(SetVectorType &Worklist, MachineInstr &Inst,
+ unsigned Opcode,
+ MachineDominatorTree *MDT = nullptr) const;
- void splitScalar64BitBinaryOp(SetVectorType &Worklist,
- MachineInstr &Inst, unsigned Opcode) const;
+ void splitScalar64BitXnor(SetVectorType &Worklist, MachineInstr &Inst,
+ MachineDominatorTree *MDT = nullptr) const;
void splitScalar64BitBCNT(SetVectorType &Worklist,
MachineInstr &Inst) const;
@@ -160,12 +173,11 @@ public:
int64_t &Offset1,
int64_t &Offset2) const override;
- bool getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
- int64_t &Offset,
- const TargetRegisterInfo *TRI) const final;
+ bool getMemOperandWithOffset(MachineInstr &LdSt, MachineOperand *&BaseOp,
+ int64_t &Offset,
+ const TargetRegisterInfo *TRI) const final;
- bool shouldClusterMemOps(MachineInstr &FirstLdSt, unsigned BaseReg1,
- MachineInstr &SecondLdSt, unsigned BaseReg2,
+ bool shouldClusterMemOps(MachineOperand &BaseOp1, MachineOperand &BaseOp2,
unsigned NumLoads) const override;
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0,
@@ -225,6 +237,9 @@ public:
bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2) const override;
+ bool findCommutedOpIndices(MCInstrDesc Desc, unsigned & SrcOpIdx0,
+ unsigned & SrcOpIdx1) const;
+
bool isBranchOffsetInRange(unsigned BranchOpc,
int64_t BrOffset) const override;
@@ -276,7 +291,7 @@ public:
unsigned TrueReg, unsigned FalseReg) const;
unsigned getAddressSpaceForPseudoSourceKind(
- PseudoSourceValue::PSVKind Kind) const override;
+ unsigned Kind) const override;
bool
areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb,
@@ -589,6 +604,14 @@ public:
return MI.getDesc().TSFlags & ClampFlags;
}
+ static bool usesFPDPRounding(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::FPDPRounding;
+ }
+
+ bool usesFPDPRounding(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::FPDPRounding;
+ }
+
bool isVGPRCopy(const MachineInstr &MI) const {
assert(MI.isCopy());
unsigned Dest = MI.getOperand(0).getReg();
@@ -689,6 +712,12 @@ public:
unsigned OpName) const;
bool hasAnyModifiersSet(const MachineInstr &MI) const;
+ bool canShrink(const MachineInstr &MI,
+ const MachineRegisterInfo &MRI) const;
+
+ MachineInstr *buildShrunkInst(MachineInstr &MI,
+ unsigned NewOpcode) const;
+
bool verifyInstruction(const MachineInstr &MI,
StringRef &ErrInfo) const override;
@@ -719,6 +748,16 @@ public:
/// This form should usually be preferred since it handles operands
/// with unknown register classes.
unsigned getOpSize(const MachineInstr &MI, unsigned OpNo) const {
+ const MachineOperand &MO = MI.getOperand(OpNo);
+ if (MO.isReg()) {
+ if (unsigned SubReg = MO.getSubReg()) {
+ assert(RI.getRegSizeInBits(*RI.getSubClassWithSubReg(
+ MI.getParent()->getParent()->getRegInfo().
+ getRegClass(MO.getReg()), SubReg)) >= 32 &&
+ "Sub-dword subregs are not supported");
+ return RI.getSubRegIndexLaneMask(SubReg).getNumLanes() * 4;
+ }
+ }
return RI.getRegSizeInBits(*getOpRegClass(MI, OpNo)) / 8;
}
@@ -777,14 +816,16 @@ public:
MachineOperand &Op, MachineRegisterInfo &MRI,
const DebugLoc &DL) const;
- /// Legalize all operands in this instruction. This function may
- /// create new instruction and insert them before \p MI.
- void legalizeOperands(MachineInstr &MI) const;
+ /// Legalize all operands in this instruction. This function may create new
+ /// instructions and control-flow around \p MI. If present, \p MDT is
+ /// updated.
+ void legalizeOperands(MachineInstr &MI,
+ MachineDominatorTree *MDT = nullptr) const;
/// Replace this instruction's opcode with the equivalent VALU
/// opcode. This function will also move the users of \p MI to the
- /// VALU if necessary.
- void moveToVALU(MachineInstr &MI) const;
+ /// VALU if necessary. If present, \p MDT is updated.
+ void moveToVALU(MachineInstr &MI, MachineDominatorTree *MDT = nullptr) const;
void insertWaitStates(MachineBasicBlock &MBB,MachineBasicBlock::iterator MI,
int Count) const;
@@ -885,9 +926,36 @@ public:
/// Return -1 if the target-specific opcode for the pseudo instruction does
/// not exist. If Opcode is not a pseudo instruction, this is identity.
int pseudoToMCOpcode(int Opcode) const;
-
};
+/// \brief Returns true if the reg:subreg pair P is of register class TRC
+inline bool isOfRegClass(const TargetInstrInfo::RegSubRegPair &P,
+ const TargetRegisterClass &TRC,
+ MachineRegisterInfo &MRI) {
+ auto *RC = MRI.getRegClass(P.Reg);
+ if (!P.SubReg)
+ return RC == &TRC;
+ auto *TRI = MRI.getTargetRegisterInfo();
+ return RC == TRI->getMatchingSuperRegClass(RC, &TRC, P.SubReg);
+}
+
+/// \brief Create RegSubRegPair from a register MachineOperand
+inline
+TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O) {
+ assert(O.isReg());
+ return TargetInstrInfo::RegSubRegPair(O.getReg(), O.getSubReg());
+}
+
+/// \brief Return the SubReg component from REG_SEQUENCE
+TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI,
+ unsigned SubReg);
+
+/// \brief Return the defining instruction for a given reg:subreg pair
+/// skipping copy like instructions and subreg-manipulation pseudos.
+/// Following another subreg of a reg:subreg isn't supported.
+MachineInstr *getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
+ MachineRegisterInfo &MRI);
+
namespace AMDGPU {
LLVM_READONLY
@@ -900,6 +968,9 @@ namespace AMDGPU {
int getSDWAOp(uint16_t Opcode);
LLVM_READONLY
+ int getDPPOp32(uint16_t Opcode);
+
+ LLVM_READONLY
int getBasicFromSDWAOp(uint16_t Opcode);
LLVM_READONLY
@@ -911,6 +982,12 @@ namespace AMDGPU {
LLVM_READONLY
int getAddr64Inst(uint16_t Opcode);
+ /// Check if \p Opcode is an Addr64 opcode.
+ ///
+ /// \returns \p Opcode if it is an Addr64 opcode, otherwise -1.
+ LLVM_READONLY
+ int getIfAddr64Inst(uint16_t Opcode);
+
LLVM_READONLY
int getMUBUFNoLdsInst(uint16_t Opcode);
@@ -923,6 +1000,9 @@ namespace AMDGPU {
LLVM_READONLY
int getSOPKOp(uint16_t Opcode);
+ LLVM_READONLY
+ int getGlobalSaddrOp(uint16_t Opcode);
+
const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19);
const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21);
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td
index 8fa37aa83daed..13afa4d4974bf 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@@ -40,9 +40,9 @@ def SIEncodingFamily {
def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPUnaryOp>;
-def SIload_constant : SDNode<"AMDGPUISD::LOAD_CONSTANT",
- SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i32>]>,
- [SDNPMayLoad, SDNPMemOperand]
+def SIsbuffer_load : SDNode<"AMDGPUISD::SBUFFER_LOAD",
+ SDTypeProfile<1, 3, [SDTCisVT<1, v4i32>, SDTCisVT<2, i32>, SDTCisVT<3, i1>]>,
+ [SDNPMayLoad, SDNPMemOperand]
>;
def SIatomic_inc : SDNode<"AMDGPUISD::ATOMIC_INC", SDTAtomic2,
@@ -69,36 +69,34 @@ def SIatomic_fmax : SDNode<"AMDGPUISD::ATOMIC_LOAD_FMAX", SDTAtomic2_f32,
[SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
>;
-def SDTbuffer_load : SDTypeProfile<1, 9,
+def SDTtbuffer_load : SDTypeProfile<1, 8,
[ // vdata
SDTCisVT<1, v4i32>, // rsrc
SDTCisVT<2, i32>, // vindex(VGPR)
SDTCisVT<3, i32>, // voffset(VGPR)
SDTCisVT<4, i32>, // soffset(SGPR)
SDTCisVT<5, i32>, // offset(imm)
- SDTCisVT<6, i32>, // dfmt(imm)
- SDTCisVT<7, i32>, // nfmt(imm)
- SDTCisVT<8, i32>, // glc(imm)
- SDTCisVT<9, i32> // slc(imm)
+ SDTCisVT<6, i32>, // format(imm)
+ SDTCisVT<7, i32>, // cachecontrol(imm)
+ SDTCisVT<8, i1> // idxen(imm)
]>;
-def SItbuffer_load : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT", SDTbuffer_load,
+def SItbuffer_load : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT", SDTtbuffer_load,
[SDNPMayLoad, SDNPMemOperand, SDNPHasChain]>;
def SItbuffer_load_d16 : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT_D16",
- SDTbuffer_load,
+ SDTtbuffer_load,
[SDNPMayLoad, SDNPMemOperand, SDNPHasChain]>;
-def SDTtbuffer_store : SDTypeProfile<0, 10,
+def SDTtbuffer_store : SDTypeProfile<0, 9,
[ // vdata
SDTCisVT<1, v4i32>, // rsrc
SDTCisVT<2, i32>, // vindex(VGPR)
SDTCisVT<3, i32>, // voffset(VGPR)
SDTCisVT<4, i32>, // soffset(SGPR)
SDTCisVT<5, i32>, // offset(imm)
- SDTCisVT<6, i32>, // dfmt(imm)
- SDTCisVT<7, i32>, // nfmt(imm)
- SDTCisVT<8, i32>, // glc(imm)
- SDTCisVT<9, i32> // slc(imm)
+ SDTCisVT<6, i32>, // format(imm)
+ SDTCisVT<7, i32>, // cachecontrol(imm)
+ SDTCisVT<8, i1> // idxen(imm)
]>;
def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT", SDTtbuffer_store,
@@ -110,13 +108,15 @@ def SItbuffer_store_d16 : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT_D16",
SDTtbuffer_store,
[SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
-def SDTBufferLoad : SDTypeProfile<1, 5,
+def SDTBufferLoad : SDTypeProfile<1, 7,
[ // vdata
SDTCisVT<1, v4i32>, // rsrc
- SDTCisVT<2, i32>, // vindex
- SDTCisVT<3, i32>, // offset
- SDTCisVT<4, i1>, // glc
- SDTCisVT<5, i1>]>; // slc
+ SDTCisVT<2, i32>, // vindex(VGPR)
+ SDTCisVT<3, i32>, // voffset(VGPR)
+ SDTCisVT<4, i32>, // soffset(SGPR)
+ SDTCisVT<5, i32>, // offset(imm)
+ SDTCisVT<6, i32>, // cachepolicy(imm)
+ SDTCisVT<7, i1>]>; // idxen(imm)
def SIbuffer_load : SDNode <"AMDGPUISD::BUFFER_LOAD", SDTBufferLoad,
[SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
@@ -126,13 +126,15 @@ def SIbuffer_load_format_d16 : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT_D16",
SDTBufferLoad,
[SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
-def SDTBufferStore : SDTypeProfile<0, 6,
+def SDTBufferStore : SDTypeProfile<0, 8,
[ // vdata
SDTCisVT<1, v4i32>, // rsrc
- SDTCisVT<2, i32>, // vindex
- SDTCisVT<3, i32>, // offset
- SDTCisVT<4, i1>, // glc
- SDTCisVT<5, i1>]>; // slc
+ SDTCisVT<2, i32>, // vindex(VGPR)
+ SDTCisVT<3, i32>, // voffset(VGPR)
+ SDTCisVT<4, i32>, // soffset(SGPR)
+ SDTCisVT<5, i32>, // offset(imm)
+ SDTCisVT<6, i32>, // cachepolicy(imm)
+ SDTCisVT<7, i1>]>; // idxen(imm)
def SIbuffer_store : SDNode <"AMDGPUISD::BUFFER_STORE", SDTBufferStore,
[SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
@@ -144,13 +146,16 @@ def SIbuffer_store_format_d16 : SDNode <"AMDGPUISD::BUFFER_STORE_FORMAT_D16",
[SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
class SDBufferAtomic<string opcode> : SDNode <opcode,
- SDTypeProfile<1, 5,
+ SDTypeProfile<1, 8,
[SDTCisVT<0, i32>, // dst
SDTCisVT<1, i32>, // vdata
SDTCisVT<2, v4i32>, // rsrc
- SDTCisVT<3, i32>, // vindex
- SDTCisVT<4, i32>, // offset
- SDTCisVT<5, i1>]>, // slc
+ SDTCisVT<3, i32>, // vindex(VGPR)
+ SDTCisVT<4, i32>, // voffset(VGPR)
+ SDTCisVT<5, i32>, // soffset(SGPR)
+ SDTCisVT<6, i32>, // offset(imm)
+ SDTCisVT<7, i32>, // cachepolicy(imm)
+ SDTCisVT<8, i1>]>, // idxen(imm)
[SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore]
>;
@@ -166,14 +171,17 @@ def SIbuffer_atomic_or : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_OR">;
def SIbuffer_atomic_xor : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_XOR">;
def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP",
- SDTypeProfile<1, 6,
+ SDTypeProfile<1, 9,
[SDTCisVT<0, i32>, // dst
SDTCisVT<1, i32>, // src
SDTCisVT<2, i32>, // cmp
SDTCisVT<3, v4i32>, // rsrc
- SDTCisVT<4, i32>, // vindex
- SDTCisVT<5, i32>, // offset
- SDTCisVT<6, i1>]>, // slc
+ SDTCisVT<4, i32>, // vindex(VGPR)
+ SDTCisVT<5, i32>, // voffset(VGPR)
+ SDTCisVT<6, i32>, // soffset(SGPR)
+ SDTCisVT<7, i32>, // offset(imm)
+ SDTCisVT<8, i32>, // cachepolicy(imm)
+ SDTCisVT<9, i1>]>, // idxen(imm)
[SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore]
>;
@@ -487,24 +495,7 @@ class InlineFPImm <ValueType vt> : PatLeaf <(vt fpimm), [{
}]>;
class VGPRImm <dag frag> : PatLeaf<frag, [{
- if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) {
- return false;
- }
- const SIRegisterInfo *SIRI =
- static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
- unsigned Limit = 0;
- for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
- Limit < 10 && U != E; ++U, ++Limit) {
- const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());
-
- // If the register class is unknown, it could be an unknown
- // register class that needs to be an SGPR, e.g. an inline asm
- // constraint
- if (!RC || SIRI->isSGPRClass(RC))
- return false;
- }
-
- return Limit < 10;
+ return isVGPRImm(N);
}]>;
def NegateImm : SDNodeXForm<imm, [{
@@ -746,14 +737,13 @@ def SLC : NamedOperandBit<"SLC", NamedMatchClass<"SLC">>;
def TFE : NamedOperandBit<"TFE", NamedMatchClass<"TFE">>;
def UNorm : NamedOperandBit<"UNorm", NamedMatchClass<"UNorm">>;
def DA : NamedOperandBit<"DA", NamedMatchClass<"DA">>;
-def R128 : NamedOperandBit<"R128", NamedMatchClass<"R128">>;
+def R128A16 : NamedOperandBit<"R128A16", NamedMatchClass<"R128A16">>;
def D16 : NamedOperandBit<"D16", NamedMatchClass<"D16">>;
def LWE : NamedOperandBit<"LWE", NamedMatchClass<"LWE">>;
def exp_compr : NamedOperandBit<"ExpCompr", NamedMatchClass<"ExpCompr">>;
def exp_vm : NamedOperandBit<"ExpVM", NamedMatchClass<"ExpVM">>;
-def DFMT : NamedOperandU8<"DFMT", NamedMatchClass<"DFMT">>;
-def NFMT : NamedOperandU8<"NFMT", NamedMatchClass<"NFMT">>;
+def FORMAT : NamedOperandU8<"FORMAT", NamedMatchClass<"FORMAT">>;
def DMask : NamedOperandU16<"DMask", NamedMatchClass<"DMask">>;
@@ -1632,7 +1622,7 @@ class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
0, // 64-bit dst - No DPP or SDWA for 64-bit operands
!if(!eq(Src0VT.Size, 64),
0, // 64-bit src0
- !if(!eq(Src0VT.Size, 64),
+ !if(!eq(Src1VT.Size, 64),
0, // 64-bit src1
1
)
@@ -1641,6 +1631,12 @@ class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
);
}
+class getHasDPP <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
+ ValueType Src1VT = i32> {
+ bit ret = !if(!eq(NumSrcArgs, 0), 0,
+ getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret);
+}
+
class BitOr<bit a, bit b> {
bit ret = !if(a, 1, !if(b, 1, 0));
}
@@ -1649,6 +1645,11 @@ class BitAnd<bit a, bit b> {
bit ret = !if(a, !if(b, 1, 0), 0);
}
+def PatGenMode {
+ int NoPattern = 0;
+ int Pattern = 1;
+}
+
class VOPProfile <list<ValueType> _ArgVT> {
field list<ValueType> ArgVT = _ArgVT;
@@ -1715,7 +1716,10 @@ class VOPProfile <list<ValueType> _ArgVT> {
field bit HasSDWAOMod = isFloatType<DstVT>.ret;
field bit HasExt = getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
- field bit HasSDWA9 = HasExt;
+ field bit HasExtDPP = getHasDPP<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
+ field bit HasExtSDWA = HasExt;
+ field bit HasExtSDWA9 = HasExt;
+ field int NeedPatGen = PatGenMode.NoPattern;
field Operand Src0PackedMod = !if(HasSrc0FloatMods, PackedF16InputMods, PackedI16InputMods);
field Operand Src1PackedMod = !if(HasSrc1FloatMods, PackedF16InputMods, PackedI16InputMods);
@@ -1743,8 +1747,10 @@ class VOPProfile <list<ValueType> _ArgVT> {
getOpSelMod<Src0VT>.ret,
getOpSelMod<Src1VT>.ret,
getOpSelMod<Src2VT>.ret>.ret;
- field dag InsDPP = getInsDPP<DstRCDPP, Src0DPP, Src1DPP, NumSrcArgs,
- HasModifiers, Src0ModDPP, Src1ModDPP>.ret;
+ field dag InsDPP = !if(HasExtDPP,
+ getInsDPP<DstRCDPP, Src0DPP, Src1DPP, NumSrcArgs,
+ HasModifiers, Src0ModDPP, Src1ModDPP>.ret,
+ (ins));
field dag InsSDWA = getInsSDWA<Src0SDWA, Src1SDWA, NumSrcArgs,
HasSDWAOMod, Src0ModSDWA, Src1ModSDWA,
DstVT>.ret;
@@ -1758,14 +1764,21 @@ class VOPProfile <list<ValueType> _ArgVT> {
HasSrc0FloatMods,
HasSrc1FloatMods,
HasSrc2FloatMods>.ret;
- field string AsmDPP = getAsmDPP<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret;
+ field string AsmDPP = !if(HasExtDPP,
+ getAsmDPP<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret, "");
field string AsmSDWA = getAsmSDWA<HasDst, NumSrcArgs, DstVT>.ret;
field string AsmSDWA9 = getAsmSDWA9<HasDst, HasSDWAOMod, NumSrcArgs, DstVT>.ret;
}
class VOP_NO_EXT <VOPProfile p> : VOPProfile <p.ArgVT> {
let HasExt = 0;
- let HasSDWA9 = 0;
+ let HasExtDPP = 0;
+ let HasExtSDWA = 0;
+ let HasExtSDWA9 = 0;
+}
+
+class VOP_PAT_GEN <VOPProfile p, int mode=PatGenMode.Pattern> : VOPProfile <p.ArgVT> {
+ let NeedPatGen = mode;
}
def VOP_F16_F16 : VOPProfile <[f16, f16, untyped, untyped]>;
@@ -1788,6 +1801,8 @@ def VOP_B32_F16_F16 : VOPProfile <[i32, f16, f16, untyped]>;
def VOP_V2F16_V2F16_V2F16_V2F16 : VOPProfile <[v2f16, v2f16, v2f16, v2f16]>;
def VOP_V2I16_V2I16_V2I16_V2I16 : VOPProfile <[v2i16, v2i16, v2i16, v2i16]>;
+def VOP_V2I16_F32_F32 : VOPProfile <[v2i16, f32, f32, untyped]>;
+def VOP_V2I16_I32_I32 : VOPProfile <[v2i16, i32, i32, untyped]>;
def VOP_F32_V2F16_V2F16_V2F16 : VOPProfile <[f32, v2f16, v2f16, v2f16]>;
@@ -1925,6 +1940,15 @@ def getBasicFromSDWAOp : InstrMapping {
let ValueCols = [["Default"]];
}
+// Maps ordinary instructions to their DPP counterparts
+def getDPPOp32 : InstrMapping {
+ let FilterClass = "VOP";
+ let RowFields = ["OpName"];
+ let ColFields = ["AsmVariantName"];
+ let KeyCol = ["Default"];
+ let ValueCols = [["DPP"]];
+}
+
// Maps a commuted opcode to its original version
def getCommuteOrig : InstrMapping {
let FilterClass = "Commutable_REV";
@@ -1977,6 +2001,14 @@ def getAddr64Inst : InstrMapping {
let ValueCols = [["1"]];
}
+def getIfAddr64Inst : InstrMapping {
+ let FilterClass = "MUBUFAddr64Table";
+ let RowFields = ["OpName"];
+ let ColFields = ["IsAddr64"];
+ let KeyCol = ["1"];
+ let ValueCols = [["1"]];
+}
+
def getMUBUFNoLdsInst : InstrMapping {
let FilterClass = "MUBUFLdsTable";
let RowFields = ["OpName"];
@@ -2003,6 +2035,15 @@ def getAtomicNoRetOp : InstrMapping {
let ValueCols = [["0"]];
}
+// Maps a GLOBAL to its SADDR form.
+def getGlobalSaddrOp : InstrMapping {
+ let FilterClass = "GlobalSaddrTable";
+ let RowFields = ["SaddrOp"];
+ let ColFields = ["IsSaddr"];
+ let KeyCol = ["0"];
+ let ValueCols = [["1"]];
+}
+
include "SIInstructions.td"
include "DSInstructions.td"
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
index 5c10646161b39..b6b00c2e4257a 100644
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -15,8 +15,8 @@ class GCNPat<dag pattern, dag result> : Pat<pattern, result>, GCNPredicateContro
let SubtargetPredicate = isGCN;
}
-include "VOPInstructions.td"
include "SOPInstructions.td"
+include "VOPInstructions.td"
include "SMInstructions.td"
include "FLATInstructions.td"
include "BUFInstructions.td"
@@ -164,29 +164,26 @@ def S_SUB_U64_CO_PSEUDO : SPseudoInstSI <
} // End usesCustomInserter = 1, Defs = [SCC]
-let usesCustomInserter = 1, SALU = 1 in {
-def GET_GROUPSTATICSIZE : PseudoInstSI <(outs SReg_32:$sdst), (ins),
+let usesCustomInserter = 1 in {
+def GET_GROUPSTATICSIZE : SPseudoInstSI <(outs SReg_32:$sdst), (ins),
[(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>;
} // End let usesCustomInserter = 1
-def S_MOV_B64_term : PseudoInstSI<(outs SReg_64:$dst),
+def S_MOV_B64_term : SPseudoInstSI<(outs SReg_64:$dst),
(ins SSrc_b64:$src0)> {
- let SALU = 1;
let isAsCheapAsAMove = 1;
let isTerminator = 1;
}
-def S_XOR_B64_term : PseudoInstSI<(outs SReg_64:$dst),
+def S_XOR_B64_term : SPseudoInstSI<(outs SReg_64:$dst),
(ins SSrc_b64:$src0, SSrc_b64:$src1)> {
- let SALU = 1;
let isAsCheapAsAMove = 1;
let isTerminator = 1;
let Defs = [SCC];
}
-def S_ANDN2_B64_term : PseudoInstSI<(outs SReg_64:$dst),
+def S_ANDN2_B64_term : SPseudoInstSI<(outs SReg_64:$dst),
(ins SSrc_b64:$src0, SSrc_b64:$src1)> {
- let SALU = 1;
let isAsCheapAsAMove = 1;
let isTerminator = 1;
}
@@ -250,7 +247,7 @@ def SI_LOOP : CFPseudoInstSI <
(outs), (ins SReg_64:$saved, brtarget:$target),
[(AMDGPUloop i64:$saved, bb:$target)], 1, 1> {
let Size = 8;
- let isBranch = 0;
+ let isBranch = 1;
let hasSideEffects = 1;
}
@@ -267,14 +264,6 @@ def SI_END_CF : CFPseudoInstSI <
let mayStore = 1;
}
-def SI_BREAK : CFPseudoInstSI <
- (outs SReg_64:$dst), (ins SReg_64:$src),
- [(set i64:$dst, (int_amdgcn_break i64:$src))], 1> {
- let Size = 4;
- let isAsCheapAsAMove = 1;
- let isReMaterializable = 1;
-}
-
def SI_IF_BREAK : CFPseudoInstSI <
(outs SReg_64:$dst), (ins SReg_64:$vcc, SReg_64:$src),
[(set i64:$dst, (int_amdgcn_if_break i1:$vcc, i64:$src))]> {
@@ -283,14 +272,6 @@ def SI_IF_BREAK : CFPseudoInstSI <
let isReMaterializable = 1;
}
-def SI_ELSE_BREAK : CFPseudoInstSI <
- (outs SReg_64:$dst), (ins SReg_64:$src0, SReg_64:$src1),
- [(set i64:$dst, (int_amdgcn_else_break i64:$src0, i64:$src1))]> {
- let Size = 4;
- let isAsCheapAsAMove = 1;
- let isReMaterializable = 1;
-}
-
let Uses = [EXEC] in {
multiclass PseudoInstKill <dag ins> {
@@ -326,6 +307,7 @@ def SI_ILLEGAL_COPY : SPseudoInstSI <
def SI_BR_UNDEF : SPseudoInstSI <(outs), (ins sopp_brtarget:$simm16)> {
let isTerminator = 1;
let usesCustomInserter = 1;
+ let isBranch = 1;
}
def SI_PS_LIVE : PseudoInstSI <
@@ -598,7 +580,13 @@ def : Pat <
(int_amdgcn_kill (i1 (setcc f32:$src, InlineFPImm<f32>:$imm, cond:$cond))),
(SI_KILL_F32_COND_IMM_PSEUDO $src, (bitcast_fpimm_to_i32 $imm), (cond_as_i32imm $cond))
>;
-// TODO: we could add more variants for other types of conditionals
+
+ // TODO: we could add more variants for other types of conditionals
+
+def : Pat <
+ (int_amdgcn_icmp i1:$src, (i1 0), (i32 33)),
+ (COPY $src) // Return the SGPRs representing i1 src
+>;
//===----------------------------------------------------------------------===//
// VOP1 Patterns
@@ -730,12 +718,14 @@ defm : SelectPat <i32, V_CNDMASK_B32_e64>;
defm : SelectPat <f16, V_CNDMASK_B32_e64>;
defm : SelectPat <f32, V_CNDMASK_B32_e64>;
+let AddedComplexity = 1 in {
def : GCNPat <
- (i32 (add (i32 (ctpop i32:$popcnt)), i32:$val)),
+ (i32 (add (i32 (getDivergentFrag<ctpop>.ret i32:$popcnt)), i32:$val)),
(V_BCNT_U32_B32_e64 $popcnt, $val)
>;
+}
def : GCNPat <
- (i16 (add (i16 (trunc (ctpop i32:$popcnt))), i16:$val)),
+ (i16 (add (i16 (trunc (getDivergentFrag<ctpop>.ret i32:$popcnt))), i16:$val)),
(V_BCNT_U32_B32_e64 $popcnt, $val)
>;
@@ -867,6 +857,8 @@ def : BitConvert <f64, v2f32, VReg_64>;
def : BitConvert <v2f32, f64, VReg_64>;
def : BitConvert <f64, v2i32, VReg_64>;
def : BitConvert <v2i32, f64, VReg_64>;
+def : BitConvert <v4i16, v4f16, VReg_64>;
+def : BitConvert <v4f16, v4i16, VReg_64>;
// FIXME: Make SGPR
def : BitConvert <v2i32, v4f16, VReg_64>;
@@ -1324,6 +1316,38 @@ def : GCNPat <
>;
def : GCNPat <
+ (i1 (add i1:$src0, i1:$src1)),
+ (S_XOR_B64 $src0, $src1)
+>;
+
+def : GCNPat <
+ (i1 (sub i1:$src0, i1:$src1)),
+ (S_XOR_B64 $src0, $src1)
+>;
+
+let AddedComplexity = 1 in {
+def : GCNPat <
+ (i1 (add i1:$src0, (i1 -1))),
+ (S_NOT_B64 $src0)
+>;
+
+def : GCNPat <
+ (i1 (sub i1:$src0, (i1 -1))),
+ (S_NOT_B64 $src0)
+>;
+}
+
+def : GCNPat <
+ (f16 (sint_to_fp i1:$src)),
+ (V_CVT_F16_F32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_NEG_ONE), $src))
+>;
+
+def : GCNPat <
+ (f16 (uint_to_fp i1:$src)),
+ (V_CVT_F16_F32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_ONE), $src))
+>;
+
+def : GCNPat <
(f32 (sint_to_fp i1:$src)),
(V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_NEG_ONE), $src)
>;
@@ -1464,13 +1488,32 @@ class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : GCNPa
def : ExpPattern<AMDGPUexport, i32, EXP>;
def : ExpPattern<AMDGPUexport_done, i32, EXP_DONE>;
-// COPY_TO_REGCLASS is workaround tablegen bug from multiple outputs
+// COPY is a workaround for a tablegen bug with multiple outputs
// caused by S_LSHL_B32's implicit scc def.
def : GCNPat <
(v2i16 (build_vector (i16 0), i16:$src1)),
- (v2i16 (COPY_TO_REGCLASS (S_LSHL_B32 i16:$src1, (i16 16)), SReg_32_XM0))
+ (v2i16 (COPY (S_LSHL_B32 i16:$src1, (i16 16))))
+>;
+
+def : GCNPat <
+ (v2i16 (build_vector i16:$src0, (i16 undef))),
+ (v2i16 (COPY $src0))
+>;
+
+def : GCNPat <
+ (v2f16 (build_vector f16:$src0, (f16 undef))),
+ (v2f16 (COPY $src0))
+>;
+
+def : GCNPat <
+ (v2i16 (build_vector (i16 undef), i16:$src1)),
+ (v2i16 (COPY (S_LSHL_B32 $src1, (i32 16))))
>;
+def : GCNPat <
+ (v2f16 (build_vector (f16 undef), f16:$src1)),
+ (v2f16 (COPY (S_LSHL_B32 $src1, (i32 16))))
+>;
let SubtargetPredicate = HasVOP3PInsts in {
def : GCNPat <
@@ -1501,15 +1544,15 @@ def : GCNPat <
} // End SubtargetPredicate = HasVOP3PInsts
-// def : GCNPat <
-// (v2f16 (scalar_to_vector f16:$src0)),
-// (COPY $src0)
-// >;
+def : GCNPat <
+ (v2f16 (scalar_to_vector f16:$src0)),
+ (COPY $src0)
+>;
-// def : GCNPat <
-// (v2i16 (scalar_to_vector i16:$src0)),
-// (COPY $src0)
-// >;
+def : GCNPat <
+ (v2i16 (scalar_to_vector i16:$src0)),
+ (COPY $src0)
+>;
def : GCNPat <
(v4i16 (scalar_to_vector i16:$src0)),
@@ -1587,18 +1630,19 @@ defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>;
defm : BFEPattern <V_BFE_U32, V_BFE_I32, S_MOV_B32>;
defm : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64, SReg_64>;
-def : IntMed3Pat<V_MED3_I32, smax, smax_oneuse, smin_oneuse>;
-def : IntMed3Pat<V_MED3_U32, umax, umax_oneuse, umin_oneuse>;
+defm : IntMed3Pat<V_MED3_I32, smin, smax, smin_oneuse, smax_oneuse>;
+defm : IntMed3Pat<V_MED3_U32, umin, umax, umin_oneuse, umax_oneuse>;
}
// This matches 16 permutations of
// max(min(x, y), min(max(x, y), z))
class FPMed3Pat<ValueType vt,
+ //SDPatternOperator max, SDPatternOperator min,
Instruction med3Inst> : GCNPat<
- (fmaxnum (fminnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
+ (fmaxnum_like (fminnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
(VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
- (fminnum_oneuse (fmaxnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
+ (fminnum_like_oneuse (fmaxnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
(VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
(vt (VOP3Mods_nnan vt:$src2, i32:$src2_mods)))),
(med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
@@ -1606,28 +1650,41 @@ class FPMed3Pat<ValueType vt,
class FP16Med3Pat<ValueType vt,
Instruction med3Inst> : GCNPat<
- (fmaxnum (fminnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
- (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
- (fminnum_oneuse (fmaxnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
- (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
+ (fmaxnum_like (fminnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
+ (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
+ (fminnum_like_oneuse (fmaxnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
+ (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
(vt (VOP3Mods_nnan vt:$src2, i32:$src2_mods)))),
(med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE)
>;
-class Int16Med3Pat<Instruction med3Inst,
+multiclass Int16Med3Pat<Instruction med3Inst,
+ SDPatternOperator min,
SDPatternOperator max,
SDPatternOperator max_oneuse,
SDPatternOperator min_oneuse,
- ValueType vt = i32> : GCNPat<
+ ValueType vt = i16> {
+ // This matches 16 permutations of
+ // max(min(x, y), min(max(x, y), z))
+ def : GCNPat <
(max (min_oneuse vt:$src0, vt:$src1),
(min_oneuse (max_oneuse vt:$src0, vt:$src1), vt:$src2)),
(med3Inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, SRCMODS.NONE, $src2, DSTCLAMP.NONE)
>;
+ // This matches 16 permutations of
+ // min(max(a, b), max(min(a, b), c))
+ def : GCNPat <
+ (min (max_oneuse vt:$src0, vt:$src1),
+ (max_oneuse (min_oneuse vt:$src0, vt:$src1), vt:$src2)),
+ (med3Inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, SRCMODS.NONE, $src2, DSTCLAMP.NONE)
+>;
+}
+
def : FPMed3Pat<f32, V_MED3_F32>;
let OtherPredicates = [isGFX9] in {
def : FP16Med3Pat<f16, V_MED3_F16>;
-def : Int16Med3Pat<V_MED3_I16, smax, smax_oneuse, smin_oneuse, i16>;
-def : Int16Med3Pat<V_MED3_U16, umax, umax_oneuse, umin_oneuse, i16>;
+defm : Int16Med3Pat<V_MED3_I16, smin, smax, smax_oneuse, smin_oneuse>;
+defm : Int16Med3Pat<V_MED3_U16, umin, umax, umax_oneuse, umin_oneuse>;
} // End OtherPredicates = [isGFX9]
diff --git a/lib/Target/AMDGPU/SIIntrinsics.td b/lib/Target/AMDGPU/SIIntrinsics.td
index 7b7cf1635050b..e51ff4b4bc50e 100644
--- a/lib/Target/AMDGPU/SIIntrinsics.td
+++ b/lib/Target/AMDGPU/SIIntrinsics.td
@@ -16,36 +16,4 @@
let TargetPrefix = "SI", isTarget = 1 in {
def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
- // Fully-flexible TBUFFER_STORE_FORMAT_* except for the ADDR64 bit, which is not exposed
- def int_SI_tbuffer_store : Intrinsic <
- [],
- [llvm_anyint_ty, // rsrc(SGPR)
- llvm_anyint_ty, // vdata(VGPR), overloaded for types i32, v2i32, v4i32
- llvm_i32_ty, // num_channels(imm), selects opcode suffix: 1=X, 2=XY, 3=XYZ, 4=XYZW
- llvm_i32_ty, // vaddr(VGPR)
- llvm_i32_ty, // soffset(SGPR)
- llvm_i32_ty, // inst_offset(imm)
- llvm_i32_ty, // dfmt(imm)
- llvm_i32_ty, // nfmt(imm)
- llvm_i32_ty, // offen(imm)
- llvm_i32_ty, // idxen(imm)
- llvm_i32_ty, // glc(imm)
- llvm_i32_ty, // slc(imm)
- llvm_i32_ty], // tfe(imm)
- []>;
-
- // Fully-flexible BUFFER_LOAD_DWORD_* except for the ADDR64 bit, which is not exposed
- def int_SI_buffer_load_dword : Intrinsic <
- [llvm_anyint_ty], // vdata(VGPR), overloaded for types i32, v2i32, v4i32
- [llvm_anyint_ty, // rsrc(SGPR)
- llvm_anyint_ty, // vaddr(VGPR)
- llvm_i32_ty, // soffset(SGPR)
- llvm_i32_ty, // inst_offset(imm)
- llvm_i32_ty, // offen(imm)
- llvm_i32_ty, // idxen(imm)
- llvm_i32_ty, // glc(imm)
- llvm_i32_ty, // slc(imm)
- llvm_i32_ty], // tfe(imm)
- [IntrReadMem, IntrArgMemOnly]>;
-
} // End TargetPrefix = "SI", isTarget = 1
diff --git a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 4b537540046fe..be291b127301d 100644
--- a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -20,6 +20,26 @@
// ==>
// s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
+// This pass also tries to promote a constant offset to the immediate by
+// adjusting the base. It tries to use a base from the nearby instructions that
+// allows it to have a 13-bit constant offset and then promotes the 13-bit
+// offset to the immediate.
+// E.g.
+// s_movk_i32 s0, 0x1800
+// v_add_co_u32_e32 v0, vcc, s0, v2
+// v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
+//
+// s_movk_i32 s0, 0x1000
+// v_add_co_u32_e32 v5, vcc, s0, v2
+// v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+// global_load_dwordx2 v[5:6], v[5:6], off
+// global_load_dwordx2 v[0:1], v[0:1], off
+// =>
+// s_movk_i32 s0, 0x1000
+// v_add_co_u32_e32 v5, vcc, s0, v2
+// v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+// global_load_dwordx2 v[5:6], v[5:6], off
+// global_load_dwordx2 v[0:1], v[5:6], off offset:2048
//
// Future improvements:
//
@@ -43,9 +63,9 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
@@ -74,23 +94,38 @@ using namespace llvm;
#define DEBUG_TYPE "si-load-store-opt"
namespace {
+enum InstClassEnum {
+ UNKNOWN,
+ DS_READ,
+ DS_WRITE,
+ S_BUFFER_LOAD_IMM,
+ BUFFER_LOAD_OFFEN = AMDGPU::BUFFER_LOAD_DWORD_OFFEN,
+ BUFFER_LOAD_OFFSET = AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
+ BUFFER_STORE_OFFEN = AMDGPU::BUFFER_STORE_DWORD_OFFEN,
+ BUFFER_STORE_OFFSET = AMDGPU::BUFFER_STORE_DWORD_OFFSET,
+ BUFFER_LOAD_OFFEN_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact,
+ BUFFER_LOAD_OFFSET_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact,
+ BUFFER_STORE_OFFEN_exact = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact,
+ BUFFER_STORE_OFFSET_exact = AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact,
+};
-class SILoadStoreOptimizer : public MachineFunctionPass {
- enum InstClassEnum {
- DS_READ_WRITE,
- S_BUFFER_LOAD_IMM,
- BUFFER_LOAD_OFFEN,
- BUFFER_LOAD_OFFSET,
- BUFFER_STORE_OFFEN,
- BUFFER_STORE_OFFSET,
- };
+enum RegisterEnum {
+ SBASE = 0x1,
+ SRSRC = 0x2,
+ SOFFSET = 0x4,
+ VADDR = 0x8,
+ ADDR = 0x10,
+};
+class SILoadStoreOptimizer : public MachineFunctionPass {
struct CombineInfo {
MachineBasicBlock::iterator I;
MachineBasicBlock::iterator Paired;
unsigned EltSize;
unsigned Offset0;
unsigned Offset1;
+ unsigned Width0;
+ unsigned Width1;
unsigned BaseOff;
InstClassEnum InstClass;
bool GLC0;
@@ -98,9 +133,23 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
bool SLC0;
bool SLC1;
bool UseST64;
- bool IsX2;
- SmallVector<MachineInstr*, 8> InstsToMove;
- };
+ SmallVector<MachineInstr *, 8> InstsToMove;
+ };
+
+ struct BaseRegisters {
+ unsigned LoReg = 0;
+ unsigned HiReg = 0;
+
+ unsigned LoSubReg = 0;
+ unsigned HiSubReg = 0;
+ };
+
+ struct MemAddress {
+ BaseRegisters Base;
+ int64_t Offset = 0;
+ };
+
+ using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
private:
const GCNSubtarget *STM = nullptr;
@@ -108,9 +157,16 @@ private:
const SIRegisterInfo *TRI = nullptr;
MachineRegisterInfo *MRI = nullptr;
AliasAnalysis *AA = nullptr;
- unsigned CreatedX2;
+ bool OptimizeAgain;
static bool offsetsCanBeCombined(CombineInfo &CI);
+ static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI);
+ static unsigned getNewOpcode(const CombineInfo &CI);
+ static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI);
+ const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI);
+ unsigned getOpcodeWidth(const MachineInstr &MI);
+ InstClassEnum getInstClass(unsigned Opc);
+ unsigned getRegs(unsigned Opc);
bool findMatchingInst(CombineInfo &CI);
@@ -123,10 +179,21 @@ private:
MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
- unsigned promoteBufferStoreOpcode(const MachineInstr &I, bool &IsX2,
- bool &IsOffen) const;
MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);
+ void updateBaseAndOffset(MachineInstr &I, unsigned NewBase,
+ int32_t NewOffset);
+ unsigned computeBase(MachineInstr &MI, const MemAddress &Addr);
+ MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI);
+ Optional<int32_t> extractConstOffset(const MachineOperand &Op);
+ void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr);
+ /// Promotes a constant offset to the immediate by adjusting the base. It
+ /// tries to use a base from the nearby instructions that allows it to have
+ /// a 13-bit constant offset, which gets promoted to the immediate.
+ bool promoteConstantOffsetToImm(MachineInstr &CI,
+ MemInfoMap &Visited,
+ SmallPtrSet<MachineInstr *, 4> &Promoted);
+
public:
static char ID;
@@ -153,8 +220,8 @@ public:
INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
"SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
- "SI Load Store Optimizer", false, false)
+INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
+ false, false)
char SILoadStoreOptimizer::ID = 0;
@@ -165,7 +232,7 @@ FunctionPass *llvm::createSILoadStoreOptimizerPass() {
}
static void moveInstsAfter(MachineBasicBlock::iterator I,
- ArrayRef<MachineInstr*> InstsToMove) {
+ ArrayRef<MachineInstr *> InstsToMove) {
MachineBasicBlock *MBB = I->getParent();
++I;
for (MachineInstr *MI : InstsToMove) {
@@ -191,21 +258,19 @@ static void addDefsUsesToList(const MachineInstr &MI,
static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
MachineBasicBlock::iterator B,
const SIInstrInfo *TII,
- AliasAnalysis * AA) {
+ AliasAnalysis *AA) {
// RAW or WAR - cannot reorder
// WAW - cannot reorder
// RAR - safe to reorder
return !(A->mayStore() || B->mayStore()) ||
- TII->areMemAccessesTriviallyDisjoint(*A, *B, AA);
+ TII->areMemAccessesTriviallyDisjoint(*A, *B, AA);
}
// Add MI and its defs to the lists if MI reads one of the defs that are
// already in the list. Returns true in that case.
-static bool
-addToListsIfDependent(MachineInstr &MI,
- DenseSet<unsigned> &RegDefs,
- DenseSet<unsigned> &PhysRegUses,
- SmallVectorImpl<MachineInstr*> &Insts) {
+static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs,
+ DenseSet<unsigned> &PhysRegUses,
+ SmallVectorImpl<MachineInstr *> &Insts) {
for (MachineOperand &Use : MI.operands()) {
// If one of the defs is read, then there is a use of Def between I and the
// instruction that I will potentially be merged with. We will need to move
@@ -228,18 +293,16 @@ addToListsIfDependent(MachineInstr &MI,
return false;
}
-static bool
-canMoveInstsAcrossMemOp(MachineInstr &MemOp,
- ArrayRef<MachineInstr*> InstsToMove,
- const SIInstrInfo *TII,
- AliasAnalysis *AA) {
+static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
+ ArrayRef<MachineInstr *> InstsToMove,
+ const SIInstrInfo *TII, AliasAnalysis *AA) {
assert(MemOp.mayLoadOrStore());
for (MachineInstr *InstToMove : InstsToMove) {
if (!InstToMove->mayLoadOrStore())
continue;
if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA))
- return false;
+ return false;
}
return true;
}
@@ -260,10 +323,9 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
CI.BaseOff = 0;
// Handle SMEM and VMEM instructions.
- if (CI.InstClass != DS_READ_WRITE) {
- unsigned Diff = CI.IsX2 ? 2 : 1;
- return (EltOffset0 + Diff == EltOffset1 ||
- EltOffset1 + Diff == EltOffset0) &&
+ if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
+ return (EltOffset0 + CI.Width0 == EltOffset1 ||
+ EltOffset1 + CI.Width1 == EltOffset0) &&
CI.GLC0 == CI.GLC1 &&
(CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
}
@@ -305,42 +367,176 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
return false;
}
+bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
+ const CombineInfo &CI) {
+ const unsigned Width = (CI.Width0 + CI.Width1);
+ switch (CI.InstClass) {
+ default:
+ return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
+ case S_BUFFER_LOAD_IMM:
+ switch (Width) {
+ default:
+ return false;
+ case 2:
+ case 4:
+ return true;
+ }
+ }
+}
+
+unsigned SILoadStoreOptimizer::getOpcodeWidth(const MachineInstr &MI) {
+ const unsigned Opc = MI.getOpcode();
+
+ if (TII->isMUBUF(MI)) {
+ return AMDGPU::getMUBUFDwords(Opc);
+ }
+
+ switch (Opc) {
+ default:
+ return 0;
+ case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
+ return 1;
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
+ return 2;
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+ return 4;
+ }
+}
+
+InstClassEnum SILoadStoreOptimizer::getInstClass(unsigned Opc) {
+ if (TII->isMUBUF(Opc)) {
+ const int baseOpcode = AMDGPU::getMUBUFBaseOpcode(Opc);
+
+ // If we couldn't identify the opcode, bail out.
+ if (baseOpcode == -1) {
+ return UNKNOWN;
+ }
+
+ switch (baseOpcode) {
+ default:
+ return UNKNOWN;
+ case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
+ return BUFFER_LOAD_OFFEN;
+ case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
+ return BUFFER_LOAD_OFFSET;
+ case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
+ return BUFFER_STORE_OFFEN;
+ case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
+ return BUFFER_STORE_OFFSET;
+ case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
+ return BUFFER_LOAD_OFFEN_exact;
+ case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
+ return BUFFER_LOAD_OFFSET_exact;
+ case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
+ return BUFFER_STORE_OFFEN_exact;
+ case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
+ return BUFFER_STORE_OFFSET_exact;
+ }
+ }
+
+ switch (Opc) {
+ default:
+ return UNKNOWN;
+ case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+ return S_BUFFER_LOAD_IMM;
+ case AMDGPU::DS_READ_B32:
+ case AMDGPU::DS_READ_B64:
+ case AMDGPU::DS_READ_B32_gfx9:
+ case AMDGPU::DS_READ_B64_gfx9:
+ return DS_READ;
+ case AMDGPU::DS_WRITE_B32:
+ case AMDGPU::DS_WRITE_B64:
+ case AMDGPU::DS_WRITE_B32_gfx9:
+ case AMDGPU::DS_WRITE_B64_gfx9:
+ return DS_WRITE;
+ }
+}
+
+unsigned SILoadStoreOptimizer::getRegs(unsigned Opc) {
+ if (TII->isMUBUF(Opc)) {
+ unsigned result = 0;
+
+ if (AMDGPU::getMUBUFHasVAddr(Opc)) {
+ result |= VADDR;
+ }
+
+ if (AMDGPU::getMUBUFHasSrsrc(Opc)) {
+ result |= SRSRC;
+ }
+
+ if (AMDGPU::getMUBUFHasSoffset(Opc)) {
+ result |= SOFFSET;
+ }
+
+ return result;
+ }
+
+ switch (Opc) {
+ default:
+ return 0;
+ case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+ return SBASE;
+ case AMDGPU::DS_READ_B32:
+ case AMDGPU::DS_READ_B64:
+ case AMDGPU::DS_READ_B32_gfx9:
+ case AMDGPU::DS_READ_B64_gfx9:
+ case AMDGPU::DS_WRITE_B32:
+ case AMDGPU::DS_WRITE_B64:
+ case AMDGPU::DS_WRITE_B32_gfx9:
+ case AMDGPU::DS_WRITE_B64_gfx9:
+ return ADDR;
+ }
+}
+
bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
MachineBasicBlock *MBB = CI.I->getParent();
MachineBasicBlock::iterator E = MBB->end();
MachineBasicBlock::iterator MBBI = CI.I;
- unsigned AddrOpName[3] = {0};
- int AddrIdx[3];
- const MachineOperand *AddrReg[3];
+ const unsigned Opc = CI.I->getOpcode();
+ const InstClassEnum InstClass = getInstClass(Opc);
+
+ if (InstClass == UNKNOWN) {
+ return false;
+ }
+
+ const unsigned Regs = getRegs(Opc);
+
+ unsigned AddrOpName[5] = {0};
+ int AddrIdx[5];
+ const MachineOperand *AddrReg[5];
unsigned NumAddresses = 0;
- switch (CI.InstClass) {
- case DS_READ_WRITE:
+ if (Regs & ADDR) {
AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
- break;
- case S_BUFFER_LOAD_IMM:
+ }
+
+ if (Regs & SBASE) {
AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
- break;
- case BUFFER_LOAD_OFFEN:
- case BUFFER_STORE_OFFEN:
- AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
- AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
- AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
- break;
- case BUFFER_LOAD_OFFSET:
- case BUFFER_STORE_OFFSET:
+ }
+
+ if (Regs & SRSRC) {
AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
+ }
+
+ if (Regs & SOFFSET) {
AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
- break;
+ }
+
+ if (Regs & VADDR) {
+ AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
}
for (unsigned i = 0; i < NumAddresses; i++) {
AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]);
AddrReg[i] = &CI.I->getOperand(AddrIdx[i]);
- // We only ever merge operations with the same base address register, so don't
- // bother scanning forward if there are no other uses.
+ // We only ever merge operations with the same base address register, so
+ // don't bother scanning forward if there are no other uses.
if (AddrReg[i]->isReg() &&
(TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) ||
MRI->hasOneNonDBGUse(AddrReg[i]->getReg())))
@@ -353,8 +549,11 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
DenseSet<unsigned> PhysRegUsesToMove;
addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
- for ( ; MBBI != E; ++MBBI) {
- if (MBBI->getOpcode() != CI.I->getOpcode()) {
+ for (; MBBI != E; ++MBBI) {
+ const bool IsDS = (InstClass == DS_READ) || (InstClass == DS_WRITE);
+
+ if ((getInstClass(MBBI->getOpcode()) != InstClass) ||
+ (IsDS && (MBBI->getOpcode() != Opc))) {
// This is not a matching instruction, but we can keep looking as
// long as one of these conditions is met:
// 1. It is safe to move I down past MBBI.
@@ -368,8 +567,8 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
}
if (MBBI->mayLoadOrStore() &&
- (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
- !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) {
+ (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
+ !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) {
// We fail condition #1, but we may still be able to satisfy condition
// #2. Add this instruction to the move list and then we will check
// if condition #2 holds once we have selected the matching instruction.
@@ -413,8 +612,8 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
continue;
}
- // Check same base pointer. Be careful of subregisters, which can occur with
- // vectors of pointers.
+ // Check same base pointer. Be careful of subregisters, which can occur
+ // with vectors of pointers.
if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
Match = false;
@@ -423,13 +622,15 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
}
if (Match) {
- int OffsetIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),
- AMDGPU::OpName::offset);
+ int OffsetIdx =
+ AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::offset);
CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
+ CI.Width0 = getOpcodeWidth(*CI.I);
CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
+ CI.Width1 = getOpcodeWidth(*MBBI);
CI.Paired = MBBI;
- if (CI.InstClass == DS_READ_WRITE) {
+ if ((CI.InstClass == DS_READ) || (CI.InstClass == DS_WRITE)) {
CI.Offset0 &= 0xffff;
CI.Offset1 &= 0xffff;
} else {
@@ -445,7 +646,7 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
// We also need to go through the list of instructions that we plan to
// move and make sure they are all safe to move down past the merged
// instruction.
- if (offsetsCanBeCombined(CI))
+ if (widthsFit(*STM, CI) && offsetsCanBeCombined(CI))
if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
return true;
}
@@ -472,12 +673,12 @@ unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
if (STM->ldsRequiresM0Init())
return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
- return (EltSize == 4) ?
- AMDGPU::DS_READ2ST64_B32_gfx9 : AMDGPU::DS_READ2ST64_B64_gfx9;
+ return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
+ : AMDGPU::DS_READ2ST64_B64_gfx9;
}
-MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
- CombineInfo &CI) {
+MachineBasicBlock::iterator
+SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) {
MachineBasicBlock *MBB = CI.I->getParent();
// Be careful, since the addresses could be subregisters themselves in weird
@@ -489,8 +690,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
unsigned NewOffset0 = CI.Offset0;
unsigned NewOffset1 = CI.Offset1;
- unsigned Opc = CI.UseST64 ?
- read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
+ unsigned Opc =
+ CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
@@ -502,39 +703,40 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
}
assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
- (NewOffset0 != NewOffset1) &&
- "Computed offset doesn't fit");
+ (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
const MCInstrDesc &Read2Desc = TII->get(Opc);
- const TargetRegisterClass *SuperRC
- = (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
+ const TargetRegisterClass *SuperRC =
+ (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
unsigned DestReg = MRI->createVirtualRegister(SuperRC);
DebugLoc DL = CI.I->getDebugLoc();
unsigned BaseReg = AddrReg->getReg();
+ unsigned BaseSubReg = AddrReg->getSubReg();
unsigned BaseRegFlags = 0;
if (CI.BaseOff) {
unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
- .addImm(CI.BaseOff);
+ .addImm(CI.BaseOff);
BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
BaseRegFlags = RegState::Kill;
TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
- .addReg(ImmReg)
- .addReg(AddrReg->getReg());
+ .addReg(ImmReg)
+ .addReg(AddrReg->getReg(), 0, BaseSubReg);
+ BaseSubReg = 0;
}
MachineInstrBuilder Read2 =
- BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
- .addReg(BaseReg, BaseRegFlags) // addr
- .addImm(NewOffset0) // offset0
- .addImm(NewOffset1) // offset1
- .addImm(0) // gds
- .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));
+ BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
+ .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
+ .addImm(NewOffset0) // offset0
+ .addImm(NewOffset1) // offset1
+ .addImm(0) // gds
+ .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
(void)Read2;
@@ -561,32 +763,36 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
if (STM->ldsRequiresM0Init())
return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
- return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 : AMDGPU::DS_WRITE2_B64_gfx9;
+ return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
+ : AMDGPU::DS_WRITE2_B64_gfx9;
}
unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
if (STM->ldsRequiresM0Init())
- return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64;
+ return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
+ : AMDGPU::DS_WRITE2ST64_B64;
- return (EltSize == 4) ?
- AMDGPU::DS_WRITE2ST64_B32_gfx9 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
+ return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
+ : AMDGPU::DS_WRITE2ST64_B64_gfx9;
}
-MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
- CombineInfo &CI) {
+MachineBasicBlock::iterator
+SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) {
MachineBasicBlock *MBB = CI.I->getParent();
// Be sure to use .add(), and not .addReg() with these. We want to be
// sure we preserve the subregister index and any register flags set on them.
- const MachineOperand *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
- const MachineOperand *Data0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
- const MachineOperand *Data1
- = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);
+ const MachineOperand *AddrReg =
+ TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
+ const MachineOperand *Data0 =
+ TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
+ const MachineOperand *Data1 =
+ TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);
unsigned NewOffset0 = CI.Offset0;
unsigned NewOffset1 = CI.Offset1;
- unsigned Opc = CI.UseST64 ?
- write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
+ unsigned Opc =
+ CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
if (NewOffset0 > NewOffset1) {
// Canonicalize the merged instruction so the smaller offset comes first.
@@ -595,36 +801,37 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
}
assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
- (NewOffset0 != NewOffset1) &&
- "Computed offset doesn't fit");
+ (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
const MCInstrDesc &Write2Desc = TII->get(Opc);
DebugLoc DL = CI.I->getDebugLoc();
unsigned BaseReg = AddrReg->getReg();
+ unsigned BaseSubReg = AddrReg->getSubReg();
unsigned BaseRegFlags = 0;
if (CI.BaseOff) {
unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
- .addImm(CI.BaseOff);
+ .addImm(CI.BaseOff);
BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
BaseRegFlags = RegState::Kill;
TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
- .addReg(ImmReg)
- .addReg(AddrReg->getReg());
+ .addReg(ImmReg)
+ .addReg(AddrReg->getReg(), 0, BaseSubReg);
+ BaseSubReg = 0;
}
MachineInstrBuilder Write2 =
- BuildMI(*MBB, CI.Paired, DL, Write2Desc)
- .addReg(BaseReg, BaseRegFlags) // addr
- .add(*Data0) // data0
- .add(*Data1) // data1
- .addImm(NewOffset0) // offset0
- .addImm(NewOffset1) // offset1
- .addImm(0) // gds
- .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));
+ BuildMI(*MBB, CI.Paired, DL, Write2Desc)
+ .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
+ .add(*Data0) // data0
+ .add(*Data1) // data1
+ .addImm(NewOffset0) // offset0
+ .addImm(NewOffset1) // offset1
+ .addImm(0) // gds
+ .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
moveInstsAfter(Write2, CI.InstsToMove);
@@ -636,15 +843,14 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
return Next;
}
-MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
- CombineInfo &CI) {
+MachineBasicBlock::iterator
+SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) {
MachineBasicBlock *MBB = CI.I->getParent();
DebugLoc DL = CI.I->getDebugLoc();
- unsigned Opcode = CI.IsX2 ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM :
- AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
+ const unsigned Opcode = getNewOpcode(CI);
+
+ const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
- const TargetRegisterClass *SuperRC =
- CI.IsX2 ? &AMDGPU::SReg_128RegClass : &AMDGPU::SReg_64_XEXECRegClass;
unsigned DestReg = MRI->createVirtualRegister(SuperRC);
unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
@@ -652,14 +858,11 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
.addImm(MergedOffset) // offset
.addImm(CI.GLC0) // glc
- .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));
+ .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
- unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
- unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;
-
- // Handle descending offsets
- if (CI.Offset0 > CI.Offset1)
- std::swap(SubRegIdx0, SubRegIdx1);
+ std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
+ const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
+ const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
// Copy to the old destination registers.
const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
@@ -681,29 +884,25 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
return Next;
}
-MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
- CombineInfo &CI) {
+MachineBasicBlock::iterator
+SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
MachineBasicBlock *MBB = CI.I->getParent();
DebugLoc DL = CI.I->getDebugLoc();
- unsigned Opcode;
- if (CI.InstClass == BUFFER_LOAD_OFFEN) {
- Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN :
- AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
- } else {
- Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET :
- AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
- }
+ const unsigned Opcode = getNewOpcode(CI);
- const TargetRegisterClass *SuperRC =
- CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass;
+ const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
+
+ // Create the destination register for the merged load.
unsigned DestReg = MRI->createVirtualRegister(SuperRC);
unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);
- if (CI.InstClass == BUFFER_LOAD_OFFEN)
- MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
+ const unsigned Regs = getRegs(Opcode);
+
+ if (Regs & VADDR)
+ MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
@@ -711,14 +910,11 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
.addImm(CI.GLC0) // glc
.addImm(CI.SLC0) // slc
.addImm(0) // tfe
- .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));
+ .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
- unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
- unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;
-
- // Handle descending offsets
- if (CI.Offset0 > CI.Offset1)
- std::swap(SubRegIdx0, SubRegIdx1);
+ std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
+ const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
+ const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
// Copy to the old destination registers.
const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
@@ -740,57 +936,137 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
return Next;
}
-unsigned SILoadStoreOptimizer::promoteBufferStoreOpcode(
- const MachineInstr &I, bool &IsX2, bool &IsOffen) const {
- IsX2 = false;
- IsOffen = false;
+unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
+ const unsigned Width = CI.Width0 + CI.Width1;
- switch (I.getOpcode()) {
- case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
- IsOffen = true;
- return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
- case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
- IsOffen = true;
- return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact;
- case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
- IsX2 = true;
- IsOffen = true;
- return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
- case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact:
- IsX2 = true;
- IsOffen = true;
- return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN_exact;
- case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
- return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
- case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
- return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact;
- case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
- IsX2 = true;
- return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
- case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact:
- IsX2 = true;
- return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET_exact;
+ switch (CI.InstClass) {
+ default:
+ return AMDGPU::getMUBUFOpcode(CI.InstClass, Width);
+ case UNKNOWN:
+ llvm_unreachable("Unknown instruction class");
+ case S_BUFFER_LOAD_IMM:
+ switch (Width) {
+ default:
+ return 0;
+ case 2:
+ return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
+ case 4:
+ return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
+ }
}
- return 0;
}
-MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
- CombineInfo &CI) {
+std::pair<unsigned, unsigned> // {sub-register index for the Offset0/Width0 access, index for the Offset1/Width1 access}
+SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) { // Returns {0, 0} for any width pair not listed below.
+ if (CI.Offset0 > CI.Offset1) { // Descending offsets: the second access occupies the low sub-registers.
+ switch (CI.Width0) {
+ default:
+ return std::make_pair(0, 0);
+ case 1: // Every inner path returns, so control never falls through to the next outer case.
+ switch (CI.Width1) {
+ default:
+ return std::make_pair(0, 0);
+ case 1:
+ return std::make_pair(AMDGPU::sub1, AMDGPU::sub0);
+ case 2:
+ return std::make_pair(AMDGPU::sub2, AMDGPU::sub0_sub1);
+ case 3:
+ return std::make_pair(AMDGPU::sub3, AMDGPU::sub0_sub1_sub2);
+ }
+ case 2:
+ switch (CI.Width1) {
+ default:
+ return std::make_pair(0, 0);
+ case 1:
+ return std::make_pair(AMDGPU::sub1_sub2, AMDGPU::sub0);
+ case 2:
+ return std::make_pair(AMDGPU::sub2_sub3, AMDGPU::sub0_sub1);
+ }
+ case 3:
+ switch (CI.Width1) {
+ default:
+ return std::make_pair(0, 0);
+ case 1:
+ return std::make_pair(AMDGPU::sub1_sub2_sub3, AMDGPU::sub0);
+ }
+ }
+ } else { // Ascending (or equal) offsets: the first access occupies the low sub-registers.
+ switch (CI.Width0) {
+ default:
+ return std::make_pair(0, 0);
+ case 1:
+ switch (CI.Width1) {
+ default:
+ return std::make_pair(0, 0);
+ case 1:
+ return std::make_pair(AMDGPU::sub0, AMDGPU::sub1);
+ case 2:
+ return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2);
+ case 3:
+ return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2_sub3);
+ }
+ case 2:
+ switch (CI.Width1) {
+ default:
+ return std::make_pair(0, 0);
+ case 1:
+ return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2);
+ case 2:
+ return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2_sub3);
+ }
+ case 3:
+ switch (CI.Width1) {
+ default:
+ return std::make_pair(0, 0);
+ case 1:
+ return std::make_pair(AMDGPU::sub0_sub1_sub2, AMDGPU::sub3);
+ }
+ }
+ }
+}
+
+const TargetRegisterClass * // Register class for the merged value; nullptr when the combined width is not handled.
+SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI) {
+ if (CI.InstClass == S_BUFFER_LOAD_IMM) { // S_BUFFER_LOAD_IMM results use the SReg_* classes.
+ switch (CI.Width0 + CI.Width1) { // Combined width; 2 maps to a 64-bit class, so widths look like dword counts (TODO confirm).
+ default:
+ return nullptr;
+ case 2:
+ return &AMDGPU::SReg_64_XEXECRegClass;
+ case 4:
+ return &AMDGPU::SReg_128RegClass;
+ case 8:
+ return &AMDGPU::SReg_256RegClass;
+ case 16:
+ return &AMDGPU::SReg_512RegClass;
+ }
+ } else { // Every other instruction class uses the VReg_* classes.
+ switch (CI.Width0 + CI.Width1) {
+ default:
+ return nullptr;
+ case 2:
+ return &AMDGPU::VReg_64RegClass;
+ case 3:
+ return &AMDGPU::VReg_96RegClass;
+ case 4:
+ return &AMDGPU::VReg_128RegClass;
+ }
+ }
+}
+
+MachineBasicBlock::iterator
+SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) {
MachineBasicBlock *MBB = CI.I->getParent();
DebugLoc DL = CI.I->getDebugLoc();
- bool Unused1, Unused2;
- unsigned Opcode = promoteBufferStoreOpcode(*CI.I, Unused1, Unused2);
- unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
- unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;
+ const unsigned Opcode = getNewOpcode(CI);
- // Handle descending offsets
- if (CI.Offset0 > CI.Offset1)
- std::swap(SubRegIdx0, SubRegIdx1);
+ std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
+ const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
+ const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
// Copy to the new source register.
- const TargetRegisterClass *SuperRC =
- CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass;
+ const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
unsigned SrcReg = MRI->createVirtualRegister(SuperRC);
const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
@@ -803,18 +1079,20 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
.addImm(SubRegIdx1);
auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
- .addReg(SrcReg, RegState::Kill);
+ .addReg(SrcReg, RegState::Kill);
- if (CI.InstClass == BUFFER_STORE_OFFEN)
- MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
+ const unsigned Regs = getRegs(Opcode);
+
+ if (Regs & VADDR)
+ MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
.addImm(std::min(CI.Offset0, CI.Offset1)) // offset
- .addImm(CI.GLC0) // glc
- .addImm(CI.SLC0) // slc
- .addImm(0) // tfe
- .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));
+ .addImm(CI.GLC0) // glc
+ .addImm(CI.SLC0) // slc
+ .addImm(0) // tfe
+ .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
moveInstsAfter(MIB, CI.InstsToMove);
@@ -824,105 +1102,399 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
return Next;
}
+MachineOperand // Operand carrying Val: an immediate when it is an inline constant, otherwise a new SReg_32 register.
+SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) {
+ APInt V(32, Val, true); // 32-bit signed view of Val for the inline-constant query.
+ if (TII->isInlineConstant(V))
+ return MachineOperand::CreateImm(Val);
+ // Not an inline constant: load it into a fresh register with an S_MOV_B32 inserted right before MI.
+ unsigned Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ MachineInstr *Mov =
+ BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
+ TII->get(AMDGPU::S_MOV_B32), Reg)
+ .addImm(Val);
+ (void)Mov; // Mov is only read inside LLVM_DEBUG; the cast avoids an unused-variable warning otherwise.
+ LLVM_DEBUG(dbgs() << " "; Mov->dump());
+ return MachineOperand::CreateReg(Reg, false); // false: the operand is a use, not a def.
+}
+
+// Emit, immediately before MI, the 64-bit sum Addr.Base + Addr.Offset and return the new VReg_64 register holding it.
+unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
+ const MemAddress &Addr) {
+ MachineBasicBlock *MBB = MI.getParent();
+ MachineBasicBlock::iterator MBBI = MI.getIterator(); // Insertion point: everything below is built before MI.
+ DebugLoc DL = MI.getDebugLoc();
+ // Each base half must be a 32-bit register or be accessed through a sub-register index.
+ assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
+ Addr.Base.LoSubReg) &&
+ "Expected 32-bit Base-Register-Low!!");
+
+ assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
+ Addr.Base.HiSubReg) &&
+ "Expected 32-bit Base-Register-Hi!!");
+
+ LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
+ MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI); // Low 32 bits of the offset.
+ MachineOperand OffsetHi =
+ createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI); // High 32 bits of the offset.
+ unsigned CarryReg = MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); // Carry from the low add into the high add.
+ unsigned DeadCarryReg =
+ MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); // Carry-out of the high add; defined but marked dead.
+
+ unsigned DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ unsigned DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ MachineInstr *LoHalf =
+ BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0)
+ .addReg(CarryReg, RegState::Define)
+ .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
+ .add(OffsetLo);
+ (void)LoHalf;
+ LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););
+ // High half: base-hi + offset-hi + carry produced by the low half.
+ MachineInstr *HiHalf =
+ BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
+ .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
+ .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
+ .add(OffsetHi)
+ .addReg(CarryReg, RegState::Kill);
+ (void)HiHalf;
+ LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););
+ // Glue the two 32-bit halves into one 64-bit register (sub0 = low, sub1 = high).
+ unsigned FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
+ MachineInstr *FullBase =
+ BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
+ .addReg(DestSub0)
+ .addImm(AMDGPU::sub0)
+ .addReg(DestSub1)
+ .addImm(AMDGPU::sub1);
+ (void)FullBase;
+ LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);
+
+ return FullDestReg;
+}
+
+// Rewrite MI in place: its vaddr operand is set to register NewBase and its offset immediate to NewOffset.
+void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
+ unsigned NewBase,
+ int32_t NewOffset) {
+ TII->getNamedOperand(MI, AMDGPU::OpName::vaddr)->setReg(NewBase); // Results are dereferenced unchecked: MI must have both operands.
+ TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
+}
+
+Optional<int32_t> // Constant carried by Op, or None when Op is neither an immediate nor a register set by S_MOV_B32 <imm>.
+SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) {
+ if (Op.isImm())
+ return Op.getImm(); // The immediate is narrowed to int32_t by the return type.
+
+ if (!Op.isReg())
+ return None;
+ // Register operand: accept it only if its unique definition is an S_MOV_B32 of an immediate.
+ MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
+ if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
+ !Def->getOperand(1).isImm())
+ return None;
+
+ return Def->getOperand(1).getImm();
+}
+
+// Analyzes Base and, when it matches the pattern below, fills Addr with:
+// - the 32-bit base registers and their sub-register indices
+// - the 64-bit constant offset (Addr is left untouched on any mismatch)
+// Expected base computation:
+// %OFFSET0:sgpr_32 = S_MOV_B32 8000
+// %LO:vgpr_32, %c:sreg_64_xexec =
+// V_ADD_I32_e64 %BASE_LO:vgpr_32, %OFFSET0:sgpr_32,
+// %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
+// %Base:vreg_64 =
+// REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
+void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
+ MemAddress &Addr) {
+ if (!Base.isReg())
+ return;
+ // Base must be defined by a two-input REG_SEQUENCE (def + 2 x {register, sub-register index} = 5 operands).
+ MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
+ if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
+ || Def->getNumOperands() != 5)
+ return;
+
+ MachineOperand BaseLo = Def->getOperand(1); // NOTE(review): the sub-register indices (operands 2 and 4) are not inspected.
+ MachineOperand BaseHi = Def->getOperand(3);
+ if (!BaseLo.isReg() || !BaseHi.isReg())
+ return;
+
+ MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
+ MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
+ // The halves must come from a V_ADD_I32_e64 / V_ADDC_U32_e64 pair.
+ if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_I32_e64 ||
+ !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
+ return; // NOTE(review): the ADDC carry-in is not checked to be the carry-out of BaseLoDef.
+
+ const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
+ const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
+ // Low half: either source may be the constant; the other one becomes the low base.
+ auto Offset0P = extractConstOffset(*Src0);
+ if (Offset0P)
+ BaseLo = *Src1;
+ else {
+ if (!(Offset0P = extractConstOffset(*Src1)))
+ return;
+ BaseLo = *Src0;
+ }
+
+ Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
+ Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
+ // High half: the constant must be a literal immediate (no S_MOV_B32 lookup here); normalize it into Src1.
+ if (Src0->isImm())
+ std::swap(Src0, Src1);
+
+ if (!Src1->isImm())
+ return;
+
+ uint64_t Offset1 = Src1->getImm();
+ BaseHi = *Src0;
+ // Pattern matched: publish the base halves and the combined 64-bit offset.
+ Addr.Base.LoReg = BaseLo.getReg();
+ Addr.Base.HiReg = BaseHi.getReg();
+ Addr.Base.LoSubReg = BaseLo.getSubReg();
+ Addr.Base.HiSubReg = BaseHi.getSubReg();
+ Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32); // Low constant in bits 0-31, high constant in bits 32-63.
+}
+
+bool SILoadStoreOptimizer::promoteConstantOffsetToImm( // Returns true iff MI (and possibly later loads) were rewritten.
+ MachineInstr &MI,
+ MemInfoMap &Visited, // Cache: instruction -> MemAddress already computed for it.
+ SmallPtrSet<MachineInstr *, 4> &AnchorList) { // Instructions already chosen as anchors; they are never promoted again.
+
+ // TODO: Support flat and scratch.
+ if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0 ||
+ TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != NULL)
+ return false;
+
+ // TODO: Support Store.
+ if (!MI.mayLoad())
+ return false;
+
+ if (AnchorList.count(&MI))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
+
+ if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
+ LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
+ return false;
+ }
+
+ // Step1: Find the base-registers and a 64bit constant offset.
+ MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
+ MemAddress MAddr;
+ if (Visited.find(&MI) == Visited.end()) {
+ processBaseWithConstOffset(Base, MAddr);
+ Visited[&MI] = MAddr;
+ } else
+ MAddr = Visited[&MI];
+
+ if (MAddr.Offset == 0) { // Presumably also the pattern-mismatch case, if MemAddress zero-initializes Offset (TODO confirm).
+ LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
+ " constant offsets that can be promoted.\n";);
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
+ << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
+
+ // Step2: Traverse through MI's basic block and find an anchor (that has the
+ // same base-registers) with the highest 13-bit distance from MI's offset.
+ // E.g. (64bit loads)
+ // bb:
+ // addr1 = &a + 4096; load1 = load(addr1, 0)
+ // addr2 = &a + 6144; load2 = load(addr2, 0)
+ // addr3 = &a + 8192; load3 = load(addr3, 0)
+ // addr4 = &a + 10240; load4 = load(addr4, 0)
+ // addr5 = &a + 12288; load5 = load(addr5, 0)
+ //
+ // Starting from the first load, the optimization will try to find a new base
+ // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
+ // have a 13-bit distance from &a + 4096. The heuristic considers &a + 8192
+ // as the new-base (anchor) because of the maximum distance, which can
+ // presumably accommodate more intermediate bases.
+ //
+ // Step3: move (&a + 8192) above load1. Compute and promote offsets from
+ // (&a + 8192) for load1, load2, load4.
+ // addr = &a + 8192
+ // load1 = load(addr, -4096)
+ // load2 = load(addr, -2048)
+ // load3 = load(addr, 0)
+ // load4 = load(addr, 2048)
+ // addr5 = &a + 12288; load5 = load(addr5, 0)
+ //
+ MachineInstr *AnchorInst = nullptr;
+ MemAddress AnchorAddr;
+ uint32_t MaxDist = std::numeric_limits<uint32_t>::min(); // i.e. 0: any legal non-zero distance beats it.
+ SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase; // Later loads sharing MI's base, with their offsets.
+
+ MachineBasicBlock *MBB = MI.getParent();
+ MachineBasicBlock::iterator E = MBB->end();
+ MachineBasicBlock::iterator MBBI = MI.getIterator();
+ ++MBBI; // Only instructions after MI are scanned.
+ const SITargetLowering *TLI =
+ static_cast<const SITargetLowering *>(STM->getTargetLowering());
+
+ for ( ; MBBI != E; ++MBBI) {
+ MachineInstr &MINext = *MBBI;
+ // TODO: Support finding an anchor(with same base) from store addresses or
+ // any other load addresses where the opcodes are different.
+ if (MINext.getOpcode() != MI.getOpcode() ||
+ TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
+ continue;
+
+ const MachineOperand &BaseNext =
+ *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
+ MemAddress MAddrNext;
+ if (Visited.find(&MINext) == Visited.end()) {
+ processBaseWithConstOffset(BaseNext, MAddrNext);
+ Visited[&MINext] = MAddrNext;
+ } else
+ MAddrNext = Visited[&MINext];
+ // Skip candidates whose base registers or sub-register indices differ from MI's.
+ if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
+ MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
+ MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
+ MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
+ continue;
+
+ InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));
+ // Keep the candidate with the largest distance from MI that is still a legal immediate offset.
+ int64_t Dist = MAddr.Offset - MAddrNext.Offset;
+ TargetLoweringBase::AddrMode AM;
+ AM.HasBaseReg = true;
+ AM.BaseOffs = Dist;
+ if (TLI->isLegalGlobalAddressingMode(AM) &&
+ (uint32_t)std::abs(Dist) > MaxDist) {
+ MaxDist = std::abs(Dist);
+
+ AnchorAddr = MAddrNext;
+ AnchorInst = &MINext;
+ }
+ }
+
+ if (AnchorInst) {
+ LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
+ AnchorInst->dump());
+ LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
+ << AnchorAddr.Offset << "\n\n");
+
+ // Instead of moving up, just re-compute anchor-instruction's base address.
+ unsigned Base = computeBase(MI, AnchorAddr); // Emitted before MI, so it is available to MI and everything after it.
+
+ updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
+ LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););
+ // Re-point every other load with the same base whose offset from the anchor is a legal immediate.
+ for (auto P : InstsWCommonBase) {
+ TargetLoweringBase::AddrMode AM;
+ AM.HasBaseReg = true;
+ AM.BaseOffs = P.second - AnchorAddr.Offset;
+
+ if (TLI->isLegalGlobalAddressingMode(AM)) {
+ LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second;
+ dbgs() << ")"; P.first->dump());
+ updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
+ LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump());
+ }
+ }
+ AnchorList.insert(AnchorInst); // Remember the anchor so a later call does not try to promote it again.
+ return true;
+ }
+
+ return false;
+}
+
// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
bool Modified = false;
+ // Caches the MemAddress already computed for each visited instruction.
+ MemInfoMap Visited;
+ // Contains the list of instructions for which constant offsets are being
+ // promoted to the IMM.
+ SmallPtrSet<MachineInstr *, 4> AnchorList;
+
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
MachineInstr &MI = *I;
+ if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
+ Modified = true;
+
// Don't combine if volatile.
if (MI.hasOrderedMemoryRef()) {
++I;
continue;
}
+ const unsigned Opc = MI.getOpcode();
+
CombineInfo CI;
CI.I = I;
- unsigned Opc = MI.getOpcode();
- if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64 ||
- Opc == AMDGPU::DS_READ_B32_gfx9 || Opc == AMDGPU::DS_READ_B64_gfx9) {
+ CI.InstClass = getInstClass(Opc);
- CI.InstClass = DS_READ_WRITE;
+ switch (CI.InstClass) {
+ default:
+ break;
+ case DS_READ:
CI.EltSize =
- (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 : 4;
-
+ (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
+ : 4;
if (findMatchingInst(CI)) {
Modified = true;
I = mergeRead2Pair(CI);
} else {
++I;
}
-
continue;
- } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64 ||
- Opc == AMDGPU::DS_WRITE_B32_gfx9 ||
- Opc == AMDGPU::DS_WRITE_B64_gfx9) {
- CI.InstClass = DS_READ_WRITE;
- CI.EltSize
- = (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 : 4;
-
+ case DS_WRITE:
+ CI.EltSize =
+ (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
+ : 4;
if (findMatchingInst(CI)) {
Modified = true;
I = mergeWrite2Pair(CI);
} else {
++I;
}
-
continue;
- }
- if (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM ||
- Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM) {
- // EltSize is in units of the offset encoding.
- CI.InstClass = S_BUFFER_LOAD_IMM;
+ case S_BUFFER_LOAD_IMM:
CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
- CI.IsX2 = Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
if (findMatchingInst(CI)) {
Modified = true;
I = mergeSBufferLoadImmPair(CI);
- if (!CI.IsX2)
- CreatedX2++;
+ OptimizeAgain |= (CI.Width0 + CI.Width1) < 16;
} else {
++I;
}
continue;
- }
- if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN ||
- Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN ||
- Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFSET ||
- Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET) {
- if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN ||
- Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN)
- CI.InstClass = BUFFER_LOAD_OFFEN;
- else
- CI.InstClass = BUFFER_LOAD_OFFSET;
-
+ case BUFFER_LOAD_OFFEN:
+ case BUFFER_LOAD_OFFSET:
+ case BUFFER_LOAD_OFFEN_exact:
+ case BUFFER_LOAD_OFFSET_exact:
CI.EltSize = 4;
- CI.IsX2 = Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN ||
- Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
if (findMatchingInst(CI)) {
Modified = true;
I = mergeBufferLoadPair(CI);
- if (!CI.IsX2)
- CreatedX2++;
+ OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
} else {
++I;
}
continue;
- }
-
- bool StoreIsX2, IsOffen;
- if (promoteBufferStoreOpcode(*I, StoreIsX2, IsOffen)) {
- CI.InstClass = IsOffen ? BUFFER_STORE_OFFEN : BUFFER_STORE_OFFSET;
+ case BUFFER_STORE_OFFEN:
+ case BUFFER_STORE_OFFSET:
+ case BUFFER_STORE_OFFEN_exact:
+ case BUFFER_STORE_OFFSET_exact:
CI.EltSize = 4;
- CI.IsX2 = StoreIsX2;
if (findMatchingInst(CI)) {
Modified = true;
I = mergeBufferStorePair(CI);
- if (!CI.IsX2)
- CreatedX2++;
+ OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
} else {
++I;
}
@@ -956,12 +1528,10 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
bool Modified = false;
for (MachineBasicBlock &MBB : MF) {
- CreatedX2 = 0;
- Modified |= optimizeBlock(MBB);
-
- // Run again to convert x2 to x4.
- if (CreatedX2 >= 1)
+ do {
+ OptimizeAgain = false;
Modified |= optimizeBlock(MBB);
+ } while (OptimizeAgain);
}
return Modified;
diff --git a/lib/Target/AMDGPU/SILowerControlFlow.cpp b/lib/Target/AMDGPU/SILowerControlFlow.cpp
index ad30317c344c3..1aa1feebbdae6 100644
--- a/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -85,9 +85,7 @@ private:
void emitIf(MachineInstr &MI);
void emitElse(MachineInstr &MI);
- void emitBreak(MachineInstr &MI);
void emitIfBreak(MachineInstr &MI);
- void emitElseBreak(MachineInstr &MI);
void emitLoop(MachineInstr &MI);
void emitEndCf(MachineInstr &MI);
@@ -329,20 +327,6 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI));
}
-void SILowerControlFlow::emitBreak(MachineInstr &MI) {
- MachineBasicBlock &MBB = *MI.getParent();
- const DebugLoc &DL = MI.getDebugLoc();
- unsigned Dst = MI.getOperand(0).getReg();
-
- MachineInstr *Or = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
- .addReg(AMDGPU::EXEC)
- .add(MI.getOperand(1));
-
- if (LIS)
- LIS->ReplaceMachineInstrInMaps(MI, *Or);
- MI.eraseFromParent();
-}
-
void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
@@ -384,11 +368,6 @@ void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
MI.eraseFromParent();
}
-void SILowerControlFlow::emitElseBreak(MachineInstr &MI) {
- // Lowered in the same way as emitIfBreak above.
- emitIfBreak(MI);
-}
-
void SILowerControlFlow::emitLoop(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
@@ -515,18 +494,10 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
emitElse(MI);
break;
- case AMDGPU::SI_BREAK:
- emitBreak(MI);
- break;
-
case AMDGPU::SI_IF_BREAK:
emitIfBreak(MI);
break;
- case AMDGPU::SI_ELSE_BREAK:
- emitElseBreak(MI);
- break;
-
case AMDGPU::SI_LOOP:
emitLoop(MI);
break;
diff --git a/lib/Target/AMDGPU/SILowerI1Copies.cpp b/lib/Target/AMDGPU/SILowerI1Copies.cpp
index ecc6cff407e18..eb038bb5d5fcf 100644
--- a/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -5,37 +5,61 @@
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
-/// i1 values are usually inserted by the CFG Structurize pass and they are
-/// unique in that they can be copied from VALU to SALU registers.
-/// This is not possible for any other value type. Since there are no
-/// MOV instructions for i1, we to use V_CMP_* and V_CNDMASK to move the i1.
-///
//===----------------------------------------------------------------------===//
//
+// This pass lowers all occurrences of i1 values (with a vreg_1 register class)
+// to lane masks (64-bit scalar registers). The pass assumes machine SSA form
+// and a wave-level control flow graph.
+//
+// Before this pass, values that are semantically i1 and are defined and used
+// within the same basic block are already represented as lane masks in scalar
+// registers. However, values that cross basic blocks are always transferred
+// between basic blocks in vreg_1 virtual registers and are lowered by this
+// pass.
+//
+// The only instructions that use or define vreg_1 virtual registers are COPY,
+// PHI, and IMPLICIT_DEF.
+//
+//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "si-i1-copies"
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
-#include "SIInstrInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "Utils/AMDGPULaneDominator.h"
-#include "llvm/CodeGen/LiveIntervals.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSSAUpdater.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetMachine.h"
+#define DEBUG_TYPE "si-i1-copies"
+
using namespace llvm;
+static unsigned createLaneMaskReg(MachineFunction &MF);
+static unsigned insertUndefLaneMask(MachineBasicBlock &MBB);
+
namespace {
class SILowerI1Copies : public MachineFunctionPass {
public:
static char ID;
+private:
+ MachineFunction *MF = nullptr;
+ MachineDominatorTree *DT = nullptr;
+ MachinePostDominatorTree *PDT = nullptr;
+ MachineRegisterInfo *MRI = nullptr;
+ const GCNSubtarget *ST = nullptr;
+ const SIInstrInfo *TII = nullptr;
+
+ DenseSet<unsigned> ConstrainRegs;
+
public:
SILowerI1Copies() : MachineFunctionPass(ID) {
initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry());
@@ -47,14 +71,337 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachinePostDominatorTree>();
MachineFunctionPass::getAnalysisUsage(AU);
}
+
+private:
+ void lowerCopiesFromI1();
+ void lowerPhis();
+ void lowerCopiesToI1();
+ bool isConstantLaneMask(unsigned Reg, bool &Val) const;
+ void buildMergeLaneMasks(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, const DebugLoc &DL,
+ unsigned DstReg, unsigned PrevReg, unsigned CurReg);
+ MachineBasicBlock::iterator
+ getSaluInsertionAtEnd(MachineBasicBlock &MBB) const;
+
+ bool isLaneMaskReg(unsigned Reg) const {
+ return TII->getRegisterInfo().isSGPRReg(*MRI, Reg) &&
+ TII->getRegisterInfo().getRegSizeInBits(Reg, *MRI) ==
+ ST->getWavefrontSize();
+ }
+};
+
+/// Helper class that analyzes the relationship between incoming values of a
+/// phi in the control flow graph to determine where an incoming value can
+/// simply be taken as a scalar lane mask as-is, and where it needs to be
+/// merged with another, previously defined lane mask.
+///
+/// The approach is as follows:
+/// - Determine all basic blocks which, starting from the incoming blocks,
+/// a wave may reach before entering the def block (the block containing the
+/// phi).
+/// - If an incoming block has no predecessors in this set, we can take the
+/// incoming value as a scalar lane mask as-is.
+/// -- A special case of this is when the def block has a self-loop.
+/// - Otherwise, the incoming value needs to be merged with a previously
+/// defined lane mask.
+/// - If there is a path into the set of reachable blocks that does _not_ go
+/// through an incoming block where we can take the scalar lane mask as-is,
+/// we need to invent an available value for the SSAUpdater. Choices are
+/// 0 and undef, with differing consequences for how to merge values etc.
+///
+/// TODO: We could use region analysis to quickly skip over SESE regions during
+/// the traversal.
+///
+class PhiIncomingAnalysis {
+ MachinePostDominatorTree &PDT;
+
+ // For each reachable basic block, whether it is a source in the induced
+ // subgraph of the CFG.
+ DenseMap<MachineBasicBlock *, bool> ReachableMap;
+ SmallVector<MachineBasicBlock *, 4> ReachableOrdered;
+ SmallVector<MachineBasicBlock *, 4> Stack;
+ SmallVector<MachineBasicBlock *, 4> Predecessors;
+
+public:
+ PhiIncomingAnalysis(MachinePostDominatorTree &PDT) : PDT(PDT) {}
+
+ /// Returns whether \p MBB is a source in the induced subgraph of reachable
+ /// blocks.
+ bool isSource(MachineBasicBlock &MBB) const {
+ return ReachableMap.find(&MBB)->second;
+ }
+
+ ArrayRef<MachineBasicBlock *> predecessors() const { return Predecessors; }
+
+ void analyze(MachineBasicBlock &DefBlock,
+ ArrayRef<MachineBasicBlock *> IncomingBlocks) {
+ assert(Stack.empty());
+ ReachableMap.clear();
+ ReachableOrdered.clear();
+ Predecessors.clear();
+
+ // Insert the def block first, so that it acts as an end point for the
+ // traversal.
+ ReachableMap.try_emplace(&DefBlock, false);
+ ReachableOrdered.push_back(&DefBlock);
+
+ for (MachineBasicBlock *MBB : IncomingBlocks) {
+ if (MBB == &DefBlock) {
+ ReachableMap[&DefBlock] = true; // self-loop on DefBlock
+ continue;
+ }
+
+ ReachableMap.try_emplace(MBB, false);
+ ReachableOrdered.push_back(MBB);
+
+ // If this block has a divergent terminator and the def block is its
+ // post-dominator, the wave may first visit the other successors.
+ bool Divergent = false;
+ for (MachineInstr &MI : MBB->terminators()) {
+ if (MI.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO ||
+ MI.getOpcode() == AMDGPU::SI_IF ||
+ MI.getOpcode() == AMDGPU::SI_ELSE ||
+ MI.getOpcode() == AMDGPU::SI_LOOP) {
+ Divergent = true;
+ break;
+ }
+ }
+
+ if (Divergent && PDT.dominates(&DefBlock, MBB)) {
+ for (MachineBasicBlock *Succ : MBB->successors())
+ Stack.push_back(Succ);
+ }
+ }
+
+ while (!Stack.empty()) {
+ MachineBasicBlock *MBB = Stack.pop_back_val();
+ if (!ReachableMap.try_emplace(MBB, false).second)
+ continue;
+ ReachableOrdered.push_back(MBB);
+
+ for (MachineBasicBlock *Succ : MBB->successors())
+ Stack.push_back(Succ);
+ }
+
+ for (MachineBasicBlock *MBB : ReachableOrdered) {
+ bool HaveReachablePred = false;
+ for (MachineBasicBlock *Pred : MBB->predecessors()) {
+ if (ReachableMap.count(Pred)) {
+ HaveReachablePred = true;
+ } else {
+ Stack.push_back(Pred);
+ }
+ }
+ if (!HaveReachablePred)
+ ReachableMap[MBB] = true;
+ if (HaveReachablePred) {
+ for (MachineBasicBlock *UnreachablePred : Stack) {
+ if (llvm::find(Predecessors, UnreachablePred) == Predecessors.end())
+ Predecessors.push_back(UnreachablePred);
+ }
+ }
+ Stack.clear();
+ }
+ }
+};
+
+/// Helper class that detects loops which require us to lower an i1 COPY into
+/// bitwise manipulation.
+///
+/// Unfortunately, we cannot use LoopInfo because LoopInfo does not distinguish
+/// between loops with the same header. Consider this example:
+///
+/// A-+-+
+/// | | |
+/// B-+ |
+/// | |
+/// C---+
+///
+/// A is the header of a loop containing A, B, and C as far as LoopInfo is
+/// concerned. However, an i1 COPY in B that is used in C must be lowered to
+/// bitwise operations to combine results from different loop iterations when
+/// B has a divergent branch (since by default we will compile this code such
+/// that threads in a wave are merged at the entry of C).
+///
+/// The following rule is implemented to determine whether bitwise operations
+/// are required: use the bitwise lowering for a def in block B if a backward
+/// edge to B is reachable without going through the nearest common
+/// post-dominator of B and all uses of the def.
+///
+/// TODO: This rule is conservative because it does not check whether the
+/// relevant branches are actually divergent.
+///
+/// The class is designed to cache the CFG traversal so that it can be re-used
+/// for multiple defs within the same basic block.
+///
+/// TODO: We could use region analysis to quickly skip over SESE regions during
+/// the traversal.
+///
+class LoopFinder {
+ MachineDominatorTree &DT;
+ MachinePostDominatorTree &PDT;
+
+ // All visited / reachable blocks, tagged by level (level 0 is the def block,
+ // level 1 are all blocks reachable including but not going through the def
+ // block's IPDOM, etc.).
+ DenseMap<MachineBasicBlock *, unsigned> Visited;
+
+ // Nearest common dominator of all visited blocks by level (level 0 is the
+ // def block). Used for seeding the SSAUpdater.
+ SmallVector<MachineBasicBlock *, 4> CommonDominators;
+
+ // Post-dominator of all visited blocks.
+ MachineBasicBlock *VisitedPostDom = nullptr;
+
+ // Level at which a loop was found: 0 is not possible; 1 = a backward edge is
+ // reachable without going through the IPDOM of the def block (if the IPDOM
+ // itself has an edge to the def block, the loop level is 2), etc.
+ unsigned FoundLoopLevel = ~0u;
+
+ MachineBasicBlock *DefBlock = nullptr;
+ SmallVector<MachineBasicBlock *, 4> Stack;
+ SmallVector<MachineBasicBlock *, 4> NextLevel;
+
+public:
+ LoopFinder(MachineDominatorTree &DT, MachinePostDominatorTree &PDT)
+ : DT(DT), PDT(PDT) {}
+
+ void initialize(MachineBasicBlock &MBB) {
+ Visited.clear();
+ CommonDominators.clear();
+ Stack.clear();
+ NextLevel.clear();
+ VisitedPostDom = nullptr;
+ FoundLoopLevel = ~0u;
+
+ DefBlock = &MBB;
+ }
+
+ /// Check whether a backward edge can be reached without going through the
+ /// given \p PostDom of the def block.
+ ///
+ /// Return the level of \p PostDom if a loop was found, or 0 otherwise.
+ unsigned findLoop(MachineBasicBlock *PostDom) {
+ MachineDomTreeNode *PDNode = PDT.getNode(DefBlock);
+
+ if (!VisitedPostDom)
+ advanceLevel();
+
+ unsigned Level = 0;
+ while (PDNode->getBlock() != PostDom) {
+ if (PDNode->getBlock() == VisitedPostDom)
+ advanceLevel();
+ PDNode = PDNode->getIDom();
+ Level++;
+ if (FoundLoopLevel == Level)
+ return Level;
+ }
+
+ return 0;
+ }
+
+ /// Add undef values dominating the loop and the optionally given additional
+ /// blocks, so that the SSA updater doesn't have to search all the way to the
+ /// function entry.
+ void addLoopEntries(unsigned LoopLevel, MachineSSAUpdater &SSAUpdater,
+ ArrayRef<MachineBasicBlock *> Blocks = {}) {
+ assert(LoopLevel < CommonDominators.size());
+
+ MachineBasicBlock *Dom = CommonDominators[LoopLevel];
+ for (MachineBasicBlock *MBB : Blocks)
+ Dom = DT.findNearestCommonDominator(Dom, MBB);
+
+ if (!inLoopLevel(*Dom, LoopLevel, Blocks)) {
+ SSAUpdater.AddAvailableValue(Dom, insertUndefLaneMask(*Dom));
+ } else {
+ // The dominator is part of the loop or the given blocks, so add the
+ // undef value to unreachable predecessors instead.
+ for (MachineBasicBlock *Pred : Dom->predecessors()) {
+ if (!inLoopLevel(*Pred, LoopLevel, Blocks))
+ SSAUpdater.AddAvailableValue(Pred, insertUndefLaneMask(*Pred));
+ }
+ }
+ }
+
+private:
+ bool inLoopLevel(MachineBasicBlock &MBB, unsigned LoopLevel,
+ ArrayRef<MachineBasicBlock *> Blocks) const {
+ auto DomIt = Visited.find(&MBB);
+ if (DomIt != Visited.end() && DomIt->second <= LoopLevel)
+ return true;
+
+ if (llvm::find(Blocks, &MBB) != Blocks.end())
+ return true;
+
+ return false;
+ }
+
+ void advanceLevel() {
+ MachineBasicBlock *VisitedDom;
+
+ if (!VisitedPostDom) {
+ VisitedPostDom = DefBlock;
+ VisitedDom = DefBlock;
+ Stack.push_back(DefBlock);
+ } else {
+ VisitedPostDom = PDT.getNode(VisitedPostDom)->getIDom()->getBlock();
+ VisitedDom = CommonDominators.back();
+
+ for (unsigned i = 0; i < NextLevel.size();) {
+ if (PDT.dominates(VisitedPostDom, NextLevel[i])) {
+ Stack.push_back(NextLevel[i]);
+
+ NextLevel[i] = NextLevel.back();
+ NextLevel.pop_back();
+ } else {
+ i++;
+ }
+ }
+ }
+
+ unsigned Level = CommonDominators.size();
+ while (!Stack.empty()) {
+ MachineBasicBlock *MBB = Stack.pop_back_val();
+ if (!PDT.dominates(VisitedPostDom, MBB))
+ NextLevel.push_back(MBB);
+
+ Visited[MBB] = Level;
+ VisitedDom = DT.findNearestCommonDominator(VisitedDom, MBB);
+
+ for (MachineBasicBlock *Succ : MBB->successors()) {
+ if (Succ == DefBlock) {
+ if (MBB == VisitedPostDom)
+ FoundLoopLevel = std::min(FoundLoopLevel, Level + 1);
+ else
+ FoundLoopLevel = std::min(FoundLoopLevel, Level);
+ continue;
+ }
+
+ if (Visited.try_emplace(Succ, ~0u).second) {
+ if (MBB == VisitedPostDom)
+ NextLevel.push_back(Succ);
+ else
+ Stack.push_back(Succ);
+ }
+ }
+ }
+
+ CommonDominators.push_back(VisitedDom);
+ }
};
} // End anonymous namespace.
-INITIALIZE_PASS(SILowerI1Copies, DEBUG_TYPE,
- "SI Lower i1 Copies", false, false)
+INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false,
+ false)
char SILowerI1Copies::ID = 0;
@@ -64,104 +411,415 @@ FunctionPass *llvm::createSILowerI1CopiesPass() {
return new SILowerI1Copies();
}
-bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
+static unsigned createLaneMaskReg(MachineFunction &MF) {
MachineRegisterInfo &MRI = MF.getRegInfo();
+ return MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+}
+
+static unsigned insertUndefLaneMask(MachineBasicBlock &MBB) {
+ MachineFunction &MF = *MBB.getParent();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
- const TargetRegisterInfo *TRI = &TII->getRegisterInfo();
+ unsigned UndefReg = createLaneMaskReg(MF);
+ BuildMI(MBB, MBB.getFirstTerminator(), {}, TII->get(AMDGPU::IMPLICIT_DEF),
+ UndefReg);
+ return UndefReg;
+}
- std::vector<unsigned> I1Defs;
+/// Lower all instructions that def or use vreg_1 registers.
+///
+/// In a first pass, we lower COPYs from vreg_1 to vector registers, as can
+/// occur around inline assembly. We do this first, before vreg_1 registers
+/// are changed to scalar mask registers.
+///
+/// Then we lower all defs of vreg_1 registers. Phi nodes are lowered before
+/// all others, because phi lowering looks through copies and can therefore
+/// often make copy lowering unnecessary.
+bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) {
+ MF = &TheMF;
+ MRI = &MF->getRegInfo();
+ DT = &getAnalysis<MachineDominatorTree>();
+ PDT = &getAnalysis<MachinePostDominatorTree>();
- for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
- BI != BE; ++BI) {
+ ST = &MF->getSubtarget<GCNSubtarget>();
+ TII = ST->getInstrInfo();
- MachineBasicBlock &MBB = *BI;
- MachineBasicBlock::iterator I, Next;
- for (I = MBB.begin(); I != MBB.end(); I = Next) {
- Next = std::next(I);
- MachineInstr &MI = *I;
+ lowerCopiesFromI1();
+ lowerPhis();
+ lowerCopiesToI1();
- if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF) {
- unsigned Reg = MI.getOperand(0).getReg();
- const TargetRegisterClass *RC = MRI.getRegClass(Reg);
- if (RC == &AMDGPU::VReg_1RegClass)
- MRI.setRegClass(Reg, &AMDGPU::SReg_64RegClass);
- continue;
- }
+ for (unsigned Reg : ConstrainRegs)
+ MRI->constrainRegClass(Reg, &AMDGPU::SReg_64_XEXECRegClass);
+ ConstrainRegs.clear();
+ return true;
+}
+
+void SILowerI1Copies::lowerCopiesFromI1() {
+ SmallVector<MachineInstr *, 4> DeadCopies;
+
+ for (MachineBasicBlock &MBB : *MF) {
+ for (MachineInstr &MI : MBB) {
if (MI.getOpcode() != AMDGPU::COPY)
continue;
- const MachineOperand &Dst = MI.getOperand(0);
- const MachineOperand &Src = MI.getOperand(1);
-
- if (!TargetRegisterInfo::isVirtualRegister(Src.getReg()) ||
- !TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
+ unsigned DstReg = MI.getOperand(0).getReg();
+ unsigned SrcReg = MI.getOperand(1).getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
+ MRI->getRegClass(SrcReg) != &AMDGPU::VReg_1RegClass)
continue;
- const TargetRegisterClass *DstRC = MRI.getRegClass(Dst.getReg());
- const TargetRegisterClass *SrcRC = MRI.getRegClass(Src.getReg());
+ if (isLaneMaskReg(DstReg) ||
+ (TargetRegisterInfo::isVirtualRegister(DstReg) &&
+ MRI->getRegClass(DstReg) == &AMDGPU::VReg_1RegClass))
+ continue;
+ // Copy into a 32-bit vector register.
+ LLVM_DEBUG(dbgs() << "Lower copy from i1: " << MI);
DebugLoc DL = MI.getDebugLoc();
- MachineInstr *DefInst = MRI.getUniqueVRegDef(Src.getReg());
- if (DstRC == &AMDGPU::VReg_1RegClass &&
- TRI->getCommonSubClass(SrcRC, &AMDGPU::SGPR_64RegClass)) {
- I1Defs.push_back(Dst.getReg());
- if (DefInst->getOpcode() == AMDGPU::S_MOV_B64) {
- if (DefInst->getOperand(1).isImm()) {
- I1Defs.push_back(Dst.getReg());
+ assert(TII->getRegisterInfo().getRegSizeInBits(DstReg, *MRI) == 32);
+ assert(!MI.getOperand(0).getSubReg());
- int64_t Val = DefInst->getOperand(1).getImm();
- assert(Val == 0 || Val == -1);
+ ConstrainRegs.insert(SrcReg);
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addImm(0)
+ .addImm(-1)
+ .addReg(SrcReg);
+ DeadCopies.push_back(&MI);
+ }
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_MOV_B32_e32))
- .add(Dst)
- .addImm(Val);
- MI.eraseFromParent();
- continue;
+ for (MachineInstr *MI : DeadCopies)
+ MI->eraseFromParent();
+ DeadCopies.clear();
+ }
+}
+
+void SILowerI1Copies::lowerPhis() {
+ MachineSSAUpdater SSAUpdater(*MF);
+ LoopFinder LF(*DT, *PDT);
+ PhiIncomingAnalysis PIA(*PDT);
+ SmallVector<MachineInstr *, 4> DeadPhis;
+ SmallVector<MachineBasicBlock *, 4> IncomingBlocks;
+ SmallVector<unsigned, 4> IncomingRegs;
+ SmallVector<unsigned, 4> IncomingUpdated;
+
+ for (MachineBasicBlock &MBB : *MF) {
+ LF.initialize(MBB);
+
+ for (MachineInstr &MI : MBB.phis()) {
+ unsigned DstReg = MI.getOperand(0).getReg();
+ if (MRI->getRegClass(DstReg) != &AMDGPU::VReg_1RegClass)
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Lower PHI: " << MI);
+
+ MRI->setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
+
+ // Collect incoming values.
+ for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
+ assert(i + 1 < MI.getNumOperands());
+ unsigned IncomingReg = MI.getOperand(i).getReg();
+ MachineBasicBlock *IncomingMBB = MI.getOperand(i + 1).getMBB();
+ MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg);
+
+ if (IncomingDef->getOpcode() == AMDGPU::COPY) {
+ IncomingReg = IncomingDef->getOperand(1).getReg();
+ assert(isLaneMaskReg(IncomingReg));
+ assert(!IncomingDef->getOperand(1).getSubReg());
+ } else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) {
+ continue;
+ } else {
+ assert(IncomingDef->isPHI());
+ }
+
+ IncomingBlocks.push_back(IncomingMBB);
+ IncomingRegs.push_back(IncomingReg);
+ }
+
+ // Phis in a loop that are observed outside the loop receive a simple but
+ // conservatively correct treatment.
+ MachineBasicBlock *PostDomBound = &MBB;
+ for (MachineInstr &Use : MRI->use_instructions(DstReg)) {
+ PostDomBound =
+ PDT->findNearestCommonDominator(PostDomBound, Use.getParent());
+ }
+
+ unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
+
+ SSAUpdater.Initialize(DstReg);
+
+ if (FoundLoopLevel) {
+ LF.addLoopEntries(FoundLoopLevel, SSAUpdater, IncomingBlocks);
+
+ for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
+ IncomingUpdated.push_back(createLaneMaskReg(*MF));
+ SSAUpdater.AddAvailableValue(IncomingBlocks[i],
+ IncomingUpdated.back());
+ }
+
+ for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
+ MachineBasicBlock &IMBB = *IncomingBlocks[i];
+ buildMergeLaneMasks(
+ IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
+ SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
+ }
+ } else {
+ // The phi is not observed from outside a loop. Use a more accurate
+ // lowering.
+ PIA.analyze(MBB, IncomingBlocks);
+
+ for (MachineBasicBlock *MBB : PIA.predecessors())
+ SSAUpdater.AddAvailableValue(MBB, insertUndefLaneMask(*MBB));
+
+ for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
+ MachineBasicBlock &IMBB = *IncomingBlocks[i];
+ if (PIA.isSource(IMBB)) {
+ IncomingUpdated.push_back(0);
+ SSAUpdater.AddAvailableValue(&IMBB, IncomingRegs[i]);
+ } else {
+ IncomingUpdated.push_back(createLaneMaskReg(*MF));
+ SSAUpdater.AddAvailableValue(&IMBB, IncomingUpdated.back());
}
}
- unsigned int TmpSrc = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::COPY), TmpSrc)
- .add(Src);
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64))
- .add(Dst)
- .addImm(0)
- .addImm(-1)
- .addReg(TmpSrc);
- MI.eraseFromParent();
- } else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) &&
- SrcRC == &AMDGPU::VReg_1RegClass) {
- if (DefInst->getOpcode() == AMDGPU::V_CNDMASK_B32_e64 &&
- DefInst->getOperand(1).isImm() && DefInst->getOperand(2).isImm() &&
- DefInst->getOperand(1).getImm() == 0 &&
- DefInst->getOperand(2).getImm() != 0 &&
- DefInst->getOperand(3).isReg() &&
- TargetRegisterInfo::isVirtualRegister(
- DefInst->getOperand(3).getReg()) &&
- TRI->getCommonSubClass(
- MRI.getRegClass(DefInst->getOperand(3).getReg()),
- &AMDGPU::SGPR_64RegClass) &&
- AMDGPU::laneDominates(DefInst->getParent(), &MBB)) {
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64))
- .add(Dst)
- .addReg(AMDGPU::EXEC)
- .add(DefInst->getOperand(3));
- } else {
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64))
- .add(Dst)
- .add(Src)
- .addImm(0);
+ for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
+ if (!IncomingUpdated[i])
+ continue;
+
+ MachineBasicBlock &IMBB = *IncomingBlocks[i];
+ buildMergeLaneMasks(
+ IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
+ SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
}
- MI.eraseFromParent();
}
+
+ unsigned NewReg = SSAUpdater.GetValueInMiddleOfBlock(&MBB);
+ if (NewReg != DstReg) {
+ MRI->replaceRegWith(NewReg, DstReg);
+
+ // Ensure that DstReg has a single def and mark the old PHI node for
+ // deletion.
+ MI.getOperand(0).setReg(NewReg);
+ DeadPhis.push_back(&MI);
+ }
+
+ IncomingBlocks.clear();
+ IncomingRegs.clear();
+ IncomingUpdated.clear();
}
+
+ for (MachineInstr *MI : DeadPhis)
+ MI->eraseFromParent();
+ DeadPhis.clear();
}
+}
+
+void SILowerI1Copies::lowerCopiesToI1() {
+ MachineSSAUpdater SSAUpdater(*MF);
+ LoopFinder LF(*DT, *PDT);
+ SmallVector<MachineInstr *, 4> DeadCopies;
+
+ for (MachineBasicBlock &MBB : *MF) {
+ LF.initialize(MBB);
- for (unsigned Reg : I1Defs)
- MRI.setRegClass(Reg, &AMDGPU::VGPR_32RegClass);
+ for (MachineInstr &MI : MBB) {
+ if (MI.getOpcode() != AMDGPU::IMPLICIT_DEF &&
+ MI.getOpcode() != AMDGPU::COPY)
+ continue;
+
+ unsigned DstReg = MI.getOperand(0).getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(DstReg) ||
+ MRI->getRegClass(DstReg) != &AMDGPU::VReg_1RegClass)
+ continue;
+
+ if (MRI->use_empty(DstReg)) {
+ DeadCopies.push_back(&MI);
+ continue;
+ }
+
+ LLVM_DEBUG(dbgs() << "Lower Other: " << MI);
+
+ MRI->setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
+ if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF)
+ continue;
+
+ DebugLoc DL = MI.getDebugLoc();
+ unsigned SrcReg = MI.getOperand(1).getReg();
+ assert(!MI.getOperand(1).getSubReg());
+
+ if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
+ !isLaneMaskReg(SrcReg)) {
+ assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32);
+ unsigned TmpReg = createLaneMaskReg(*MF);
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg)
+ .addReg(SrcReg)
+ .addImm(0);
+ MI.getOperand(1).setReg(TmpReg);
+ SrcReg = TmpReg;
+ }
+
+ // Defs in a loop that are observed outside the loop must be transformed
+ // into appropriate bit manipulation.
+ MachineBasicBlock *PostDomBound = &MBB;
+ for (MachineInstr &Use : MRI->use_instructions(DstReg)) {
+ PostDomBound =
+ PDT->findNearestCommonDominator(PostDomBound, Use.getParent());
+ }
+
+ unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
+ if (FoundLoopLevel) {
+ SSAUpdater.Initialize(DstReg);
+ SSAUpdater.AddAvailableValue(&MBB, DstReg);
+ LF.addLoopEntries(FoundLoopLevel, SSAUpdater);
+
+ buildMergeLaneMasks(MBB, MI, DL, DstReg,
+ SSAUpdater.GetValueInMiddleOfBlock(&MBB), SrcReg);
+ DeadCopies.push_back(&MI);
+ }
+ }
+
+ for (MachineInstr *MI : DeadCopies)
+ MI->eraseFromParent();
+ DeadCopies.clear();
+ }
+}
+
+bool SILowerI1Copies::isConstantLaneMask(unsigned Reg, bool &Val) const {
+ const MachineInstr *MI;
+ for (;;) {
+ MI = MRI->getUniqueVRegDef(Reg);
+ if (MI->getOpcode() != AMDGPU::COPY)
+ break;
+
+ Reg = MI->getOperand(1).getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ return false;
+ if (!isLaneMaskReg(Reg))
+ return false;
+ }
+
+ if (MI->getOpcode() != AMDGPU::S_MOV_B64)
+ return false;
+
+ if (!MI->getOperand(1).isImm())
+ return false;
+
+ int64_t Imm = MI->getOperand(1).getImm();
+ if (Imm == 0) {
+ Val = false;
+ return true;
+ }
+ if (Imm == -1) {
+ Val = true;
+ return true;
+ }
return false;
}
+
+static void instrDefsUsesSCC(const MachineInstr &MI, bool &Def, bool &Use) {
+ Def = false;
+ Use = false;
+
+ for (const MachineOperand &MO : MI.operands()) {
+ if (MO.isReg() && MO.getReg() == AMDGPU::SCC) {
+ if (MO.isUse())
+ Use = true;
+ else
+ Def = true;
+ }
+ }
+}
+
+/// Return a point at the end of the given \p MBB to insert SALU instructions
+/// for lane mask calculation. Take terminators and SCC into account.
+MachineBasicBlock::iterator
+SILowerI1Copies::getSaluInsertionAtEnd(MachineBasicBlock &MBB) const {
+ auto InsertionPt = MBB.getFirstTerminator();
+ bool TerminatorsUseSCC = false;
+ for (auto I = InsertionPt, E = MBB.end(); I != E; ++I) {
+ bool DefsSCC;
+ instrDefsUsesSCC(*I, DefsSCC, TerminatorsUseSCC);
+ if (TerminatorsUseSCC || DefsSCC)
+ break;
+ }
+
+ if (!TerminatorsUseSCC)
+ return InsertionPt;
+
+ while (InsertionPt != MBB.begin()) {
+ InsertionPt--;
+
+ bool DefSCC, UseSCC;
+ instrDefsUsesSCC(*InsertionPt, DefSCC, UseSCC);
+ if (DefSCC)
+ return InsertionPt;
+ }
+
+ // We should have at least seen an IMPLICIT_DEF or COPY.
+ llvm_unreachable("SCC used by terminator but no def in block");
+}
+
+void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DstReg,
+ unsigned PrevReg, unsigned CurReg) {
+ bool PrevVal;
+ bool PrevConstant = isConstantLaneMask(PrevReg, PrevVal);
+ bool CurVal;
+ bool CurConstant = isConstantLaneMask(CurReg, CurVal);
+
+ if (PrevConstant && CurConstant) {
+ if (PrevVal == CurVal) {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(CurReg);
+ } else if (CurVal) {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(AMDGPU::EXEC);
+ } else {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), DstReg)
+ .addReg(AMDGPU::EXEC)
+ .addImm(-1);
+ }
+ return;
+ }
+
+ unsigned PrevMaskedReg = 0;
+ unsigned CurMaskedReg = 0;
+ if (!PrevConstant) {
+ if (CurConstant && CurVal) {
+ PrevMaskedReg = PrevReg;
+ } else {
+ PrevMaskedReg = createLaneMaskReg(*MF);
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ANDN2_B64), PrevMaskedReg)
+ .addReg(PrevReg)
+ .addReg(AMDGPU::EXEC);
+ }
+ }
+ if (!CurConstant) {
+ // TODO: check whether CurReg is already masked by EXEC
+ if (PrevConstant && PrevVal) {
+ CurMaskedReg = CurReg;
+ } else {
+ CurMaskedReg = createLaneMaskReg(*MF);
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_AND_B64), CurMaskedReg)
+ .addReg(CurReg)
+ .addReg(AMDGPU::EXEC);
+ }
+ }
+
+ if (PrevConstant && !PrevVal) {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
+ .addReg(CurMaskedReg);
+ } else if (CurConstant && !CurVal) {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
+ .addReg(PrevMaskedReg);
+ } else if (PrevConstant && PrevVal) {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ORN2_B64), DstReg)
+ .addReg(CurMaskedReg)
+ .addReg(AMDGPU::EXEC);
+ } else {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_OR_B64), DstReg)
+ .addReg(PrevMaskedReg)
+ .addReg(CurMaskedReg ? CurMaskedReg : (unsigned)AMDGPU::EXEC);
+ }
+}
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 0d5ff75e37ed8..181cc41bd5ff7 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -117,7 +117,6 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
}
const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
- bool MaySpill = ST.isVGPRSpillingEnabled(F);
bool HasStackObjects = FrameInfo.hasStackObjects();
if (isEntryFunction()) {
@@ -126,21 +125,18 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
if (WorkItemIDZ)
WorkItemIDY = true;
- if (HasStackObjects || MaySpill) {
- PrivateSegmentWaveByteOffset = true;
+ PrivateSegmentWaveByteOffset = true;
// HS and GS always have the scratch wave offset in SGPR5 on GFX9.
if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 &&
(CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS))
- ArgInfo.PrivateSegmentWaveByteOffset
- = ArgDescriptor::createRegister(AMDGPU::SGPR5);
- }
+ ArgInfo.PrivateSegmentWaveByteOffset =
+ ArgDescriptor::createRegister(AMDGPU::SGPR5);
}
- bool IsCOV2 = ST.isAmdCodeObjectV2(F);
- if (IsCOV2) {
- if (HasStackObjects || MaySpill)
- PrivateSegmentBuffer = true;
+ bool isAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
+ if (isAmdHsaOrMesa) {
+ PrivateSegmentBuffer = true;
if (F.hasFnAttribute("amdgpu-dispatch-ptr"))
DispatchPtr = true;
@@ -151,14 +147,13 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
if (F.hasFnAttribute("amdgpu-dispatch-id"))
DispatchID = true;
} else if (ST.isMesaGfxShader(F)) {
- if (HasStackObjects || MaySpill)
- ImplicitBufferPtr = true;
+ ImplicitBufferPtr = true;
}
if (F.hasFnAttribute("amdgpu-kernarg-segment-ptr"))
KernargSegmentPtr = true;
- if (ST.hasFlatAddressSpace() && isEntryFunction() && IsCOV2) {
+ if (ST.hasFlatAddressSpace() && isEntryFunction() && isAmdHsaOrMesa) {
// TODO: This could be refined a lot. The attribute is a poor way of
// detecting calls that may require it before argument lowering.
if (HasStackObjects || F.hasFnAttribute("amdgpu-flat-scratch"))
diff --git a/lib/Target/AMDGPU/SIMachineScheduler.cpp b/lib/Target/AMDGPU/SIMachineScheduler.cpp
index 18754442898f7..fb7e670068fe6 100644
--- a/lib/Target/AMDGPU/SIMachineScheduler.cpp
+++ b/lib/Target/AMDGPU/SIMachineScheduler.cpp
@@ -471,7 +471,7 @@ void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) {
#ifndef NDEBUG
if (SuccSU->NumPredsLeft == 0) {
dbgs() << "*** Scheduling failed! ***\n";
- SuccSU->dump(DAG);
+ DAG->dumpNode(*SuccSU);
dbgs() << " has been released too many times!\n";
llvm_unreachable(nullptr);
}
@@ -611,13 +611,11 @@ void SIScheduleBlock::printDebug(bool full) {
dbgs() << "\nInstructions:\n";
if (!Scheduled) {
- for (SUnit* SU : SUnits) {
- SU->dump(DAG);
- }
+ for (const SUnit* SU : SUnits)
+ DAG->dumpNode(*SU);
} else {
- for (SUnit* SU : SUnits) {
- SU->dump(DAG);
- }
+ for (const SUnit* SU : SUnits)
+ DAG->dumpNode(*SU);
}
dbgs() << "///////////////////////\n";
@@ -1933,7 +1931,7 @@ void SIScheduleDAGMI::schedule()
LLVM_DEBUG(dbgs() << "Preparing Scheduling\n");
buildDAGWithRegPressure();
- LLVM_DEBUG(for (SUnit &SU : SUnits) SU.dumpAll(this));
+ LLVM_DEBUG(dump());
topologicalSort();
findRootsAndBiasEdges(TopRoots, BotRoots);
@@ -1957,12 +1955,12 @@ void SIScheduleDAGMI::schedule()
for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
SUnit *SU = &SUnits[i];
- unsigned BaseLatReg;
+ MachineOperand *BaseLatOp;
int64_t OffLatReg;
if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
IsLowLatencySU[i] = 1;
- if (SITII->getMemOpBaseRegImmOfs(*SU->getInstr(), BaseLatReg, OffLatReg,
- TRI))
+ if (SITII->getMemOperandWithOffset(*SU->getInstr(), BaseLatOp, OffLatReg,
+ TRI))
LowLatencyOffset[i] = OffLatReg;
} else if (SITII->isHighLatencyInstruction(*SU->getInstr()))
IsHighLatencySU[i] = 1;
diff --git a/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 938cdaf1ef8fb..b4a4e9e33133d 100644
--- a/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -202,8 +202,6 @@ public:
class SIMemOpAccess final {
private:
-
- AMDGPUAS SIAddrSpaceInfo;
AMDGPUMachineModuleInfo *MMI = nullptr;
/// Reports unsupported message \p Msg for \p MI to LLVM context.
@@ -255,7 +253,7 @@ protected:
/// Instruction info.
const SIInstrInfo *TII = nullptr;
- IsaInfo::IsaVersion IV;
+ IsaVersion IV;
SICacheControl(const GCNSubtarget &ST);
@@ -453,22 +451,21 @@ SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
}
SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
- if (AS == SIAddrSpaceInfo.FLAT_ADDRESS)
+ if (AS == AMDGPUAS::FLAT_ADDRESS)
return SIAtomicAddrSpace::FLAT;
- if (AS == SIAddrSpaceInfo.GLOBAL_ADDRESS)
+ if (AS == AMDGPUAS::GLOBAL_ADDRESS)
return SIAtomicAddrSpace::GLOBAL;
- if (AS == SIAddrSpaceInfo.LOCAL_ADDRESS)
+ if (AS == AMDGPUAS::LOCAL_ADDRESS)
return SIAtomicAddrSpace::LDS;
- if (AS == SIAddrSpaceInfo.PRIVATE_ADDRESS)
+ if (AS == AMDGPUAS::PRIVATE_ADDRESS)
return SIAtomicAddrSpace::SCRATCH;
- if (AS == SIAddrSpaceInfo.REGION_ADDRESS)
+ if (AS == AMDGPUAS::REGION_ADDRESS)
return SIAtomicAddrSpace::GDS;
return SIAtomicAddrSpace::OTHER;
}
SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
- SIAddrSpaceInfo = getAMDGPUAS(MF.getTarget());
MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
}
@@ -608,7 +605,7 @@ Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
SICacheControl::SICacheControl(const GCNSubtarget &ST) {
TII = ST.getInstrInfo();
- IV = IsaInfo::getIsaVersion(ST.getFeatureBits());
+ IV = getIsaVersion(ST.getCPU());
}
/* static */
@@ -815,6 +812,12 @@ bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
MachineBasicBlock &MBB = *MI->getParent();
DebugLoc DL = MI->getDebugLoc();
+ const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
+
+ const unsigned Flush = STM.isAmdPalOS() || STM.isMesa3DOS()
+ ? AMDGPU::BUFFER_WBINVL1
+ : AMDGPU::BUFFER_WBINVL1_VOL;
+
if (Pos == Position::AFTER)
++MI;
@@ -822,7 +825,7 @@ bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1_VOL));
+ BuildMI(MBB, MI, DL, TII->get(Flush));
Changed = true;
break;
case SIAtomicScope::WORKGROUP:
diff --git a/lib/Target/AMDGPU/SIModeRegister.cpp b/lib/Target/AMDGPU/SIModeRegister.cpp
new file mode 100644
index 0000000000000..883fd308f2f4b
--- /dev/null
+++ b/lib/Target/AMDGPU/SIModeRegister.cpp
@@ -0,0 +1,406 @@
+//===-- SIModeRegister.cpp - Mode Register --------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This pass inserts changes to the Mode register settings as required.
+/// Note that currently it only deals with the Double Precision Floating Point
+/// rounding mode setting, but is intended to be generic enough to be easily
+/// expanded.
+///
+//===----------------------------------------------------------------------===//
+//
+#include "AMDGPU.h"
+#include "AMDGPUInstrInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <queue>
+
+#define DEBUG_TYPE "si-mode-register"
+
+STATISTIC(NumSetregInserted, "Number of setreg of mode register inserted.");
+
+using namespace llvm;
+
+/// Describes what is known about (or required of) the hardware Mode register.
+///
+/// Mask has a '1' for every Mode bit whose value is known and Mode holds the
+/// values of those bits. Values built through the two-argument constructor
+/// always satisfy (Mode & ~Mask) == 0, which the comparison helpers below
+/// rely on.
+struct Status {
+  // Mask is a bitmask where a '1' indicates the corresponding Mode bit has a
+  // known value
+  unsigned Mask;
+  unsigned Mode;
+
+  Status() : Mask(0), Mode(0) {}
+
+  // Bits of NewMode that lie outside NewMask are discarded. The parameters
+  // are deliberately not named after the members: a parameter called Mode
+  // shadows the member inside the constructor body, which turns
+  // "Mode &= Mask" into a no-op on the member.
+  Status(unsigned NewMask, unsigned NewMode)
+      : Mask(NewMask), Mode(NewMode & NewMask) {}
+
+  // merge two status values such that only values that don't conflict are
+  // preserved; where both are known, the value from S wins
+  Status merge(const Status &S) const {
+    return Status((Mask | S.Mask), ((Mode & ~S.Mask) | (S.Mode & S.Mask)));
+  }
+
+  // merge an unknown value by using the unknown value's mask to remove bits
+  // from the result
+  Status mergeUnknown(unsigned newMask) const {
+    return Status(Mask & ~newMask, Mode & ~newMask);
+  }
+
+  // intersect two Status values to produce a mode and mask that is a subset
+  // of both values: a bit stays known only if it is known in both and has the
+  // same value in both
+  Status intersect(const Status &S) const {
+    unsigned NewMask = (Mask & S.Mask) & (Mode ^ ~S.Mode);
+    unsigned NewMode = (Mode & NewMask);
+    return Status(NewMask, NewMode);
+  }
+
+  // produce the delta required to change the Mode to the required Mode: the
+  // bits S needs that either differ from ours or are not known here
+  Status delta(const Status &S) const {
+    return Status((S.Mask & (Mode ^ S.Mode)) | (~Mask & S.Mask), S.Mode);
+  }
+
+  bool operator==(const Status &S) const {
+    return (Mask == S.Mask) && (Mode == S.Mode);
+  }
+
+  bool operator!=(const Status &S) const { return !(*this == S); }
+
+  // true if every bit S requires is known here and already has S's value
+  bool isCompatible(const Status &S) const {
+    return ((Mask & S.Mask) == S.Mask) && ((Mode & S.Mask) == S.Mode);
+  }
+
+  // true if S touches none of our known bits, or is already satisfied by them
+  bool isCombinable(const Status &S) const {
+    return !(Mask & S.Mask) || isCompatible(S);
+  }
+};
+
+/// Per-basic-block bookkeeping shared by the three phases of the pass.
+class BlockData {
+public:
+  // Mode register settings that must hold at FirstInsertionPoint (if there
+  // is one). Filled in by Phase 1.
+  Status Require;
+
+  // Net effect this block has on the Mode register. Filled in by Phase 1.
+  Status Change;
+
+  // Mode register settings in force when control leaves this block. Filled
+  // in by Phase 2.
+  Status Exit;
+
+  // Intersection of the Exit settings of every predecessor block. Filled in
+  // by Phase 2 and consumed by Phase 3.
+  Status Pred;
+
+  // First instruction in the block that has a mode requirement, recorded by
+  // Phase 1. Phase 3 inserts a mode change in front of it when one is needed.
+  MachineInstr *FirstInsertionPoint = nullptr;
+
+  BlockData() = default;
+};
+
+namespace {
+
+// Machine function pass that inserts S_SETREG_IMM32_B32 instructions so that
+// the Mode register holds the value each instruction requires. The work is
+// split into three per-block phases driven by runOnMachineFunction.
+class SIModeRegister : public MachineFunctionPass {
+public:
+ static char ID;
+
+ // Per-block data, indexed by MachineBasicBlock::getNumber(). Sized in
+ // runOnMachineFunction, populated by Phase 1 and cleared at the end of the
+ // run.
+ std::vector<std::unique_ptr<BlockData>> BlockInfo;
+ // Worklist of blocks still to be (re)visited by Phase 2.
+ std::queue<MachineBasicBlock *> Phase2List;
+
+ // The default mode register setting currently only caters for the floating
+ // point double precision rounding mode.
+ // We currently assume the default rounding mode is Round to Nearest
+ // NOTE: this should come from a per function rounding mode setting once such
+ // a setting exists.
+ unsigned DefaultMode = FP_ROUND_ROUND_TO_NEAREST;
+ Status DefaultStatus =
+ Status(FP_ROUND_MODE_DP(0x3), FP_ROUND_MODE_DP(DefaultMode));
+
+public:
+ SIModeRegister() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ // The pass only inserts instructions, so the CFG is reported as preserved.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ // Phase 1: compute Require/Change/Exit for MBB and insert the setregs needed
+ // for requirements that arise within the block.
+ void processBlockPhase1(MachineBasicBlock &MBB, const SIInstrInfo *TII);
+
+ // Phase 2: compute Pred and Exit for MBB, queueing successors when Exit
+ // changes.
+ void processBlockPhase2(MachineBasicBlock &MBB, const SIInstrInfo *TII);
+
+ // Phase 3: insert a setreg if the predecessors do not provide the mode MBB
+ // requires on entry.
+ void processBlockPhase3(MachineBasicBlock &MBB, const SIInstrInfo *TII);
+
+ // Returns the Mode register setting MI depends on (a default-constructed
+ // Status if it has none).
+ Status getInstructionMode(MachineInstr &MI, const SIInstrInfo *TII);
+
+ // Inserts the setreg instruction(s) needed to apply InstrMode in front of I.
+ void insertSetreg(MachineBasicBlock &MBB, MachineInstr *I,
+ const SIInstrInfo *TII, Status InstrMode);
+};
+} // End anonymous namespace.
+
+// Pass registration; DEBUG_TYPE ("si-mode-register") is passed as the pass
+// name argument.
+INITIALIZE_PASS(SIModeRegister, DEBUG_TYPE,
+ "Insert required mode register values", false, false)
+
+// Definition of the static ID member; it is passed to the MachineFunctionPass
+// constructor by SIModeRegister().
+char SIModeRegister::ID = 0;
+
+// Exported reference to the pass ID.
+char &llvm::SIModeRegisterID = SIModeRegister::ID;
+
+// Factory: returns a newly allocated SIModeRegister pass.
+// NOTE(review): ownership presumably passes to the pass manager - confirm.
+FunctionPass *llvm::createSIModeRegisterPass() { return new SIModeRegister(); }
+
+// Work out which Mode register setting MI depends on.
+// An instruction that does not use the Mode register yields a null
+// (default-constructed) Status. Only the floating point double precision
+// rounding setting is handled at present.
+Status SIModeRegister::getInstructionMode(MachineInstr &MI,
+                                          const SIInstrInfo *TII) {
+  // No dependence on the FP DP rounding mode means no constraint at all.
+  if (!TII->usesFPDPRounding(MI))
+    return Status();
+
+  const unsigned Opcode = MI.getOpcode();
+  const bool IsF16Interp = Opcode == AMDGPU::V_INTERP_P1LL_F16 ||
+                           Opcode == AMDGPU::V_INTERP_P1LV_F16 ||
+                           Opcode == AMDGPU::V_INTERP_P2_F16;
+
+  // f16 interpolation instructions need double precision round to zero;
+  // everything else runs with the default rounding mode.
+  if (IsF16Interp)
+    return Status(FP_ROUND_MODE_DP(3),
+                  FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_ZERO));
+  return DefaultStatus;
+}
+
+// Insert a setreg instruction to update the Mode register.
+// It is possible (though unlikely) for an instruction to require a change to
+// the value of disjoint parts of the Mode register when we don't know the
+// value of the intervening bits. In that case we need to use more than one
+// setreg instruction.
+//
+// One S_SETREG_IMM32_B32 is emitted in front of MI for every contiguous run
+// of set bits in InstrMode.Mask, writing the matching bits of InstrMode.Mode.
+// InstrMode is taken by value because its Mask is consumed as runs are
+// emitted.
+void SIModeRegister::insertSetreg(MachineBasicBlock &MBB, MachineInstr *MI,
+                                  const SIInstrInfo *TII, Status InstrMode) {
+  while (InstrMode.Mask) {
+    // Lowest remaining run of contiguous mask bits.
+    unsigned Offset = countTrailingZeros<unsigned>(InstrMode.Mask);
+    unsigned Width = countTrailingOnes<unsigned>(InstrMode.Mask >> Offset);
+    // Build the run's bitmask with unsigned arithmetic. "1 << Width" is a
+    // signed shift and is undefined once Width reaches 31 or 32, which happens
+    // when the run extends to the top of the register.
+    unsigned FieldMask = Width >= 32 ? ~0u : (1u << Width) - 1;
+    unsigned Value = (InstrMode.Mode >> Offset) & FieldMask;
+    // The inserted instruction carries no debug location.
+    BuildMI(MBB, MI, DebugLoc(), TII->get(AMDGPU::S_SETREG_IMM32_B32))
+        .addImm(Value)
+        .addImm(((Width - 1) << AMDGPU::Hwreg::WIDTH_M1_SHIFT_) |
+                (Offset << AMDGPU::Hwreg::OFFSET_SHIFT_) |
+                (AMDGPU::Hwreg::ID_MODE << AMDGPU::Hwreg::ID_SHIFT_));
+    ++NumSetregInserted;
+    // This run is done; continue with any higher runs.
+    InstrMode.Mask &= ~(FieldMask << Offset);
+  }
+}
+
+// In Phase 1 we iterate through the instructions of the block and for each
+// instruction we get its mode usage. If the instruction uses the Mode register
+// we:
+// - update the Change status, which tracks the changes to the Mode register
+//   made by this block
+// - if this instruction's requirements are compatible with the current setting
+//   of the Mode register we merge the modes
+// - if it isn't compatible and an InsertionPoint isn't set, then we set the
+//   InsertionPoint to the current instruction, and we remember the current
+//   mode
+// - if it isn't compatible and InsertionPoint is set we insert a setreg before
+//   that instruction (unless this instruction forms part of the block's
+//   entry requirements in which case the insertion is deferred until Phase 3
+//   when predecessor exit values are known), and move the insertion point to
+//   this instruction
+// - if this is a setreg instruction we treat it as an incompatible instruction.
+//   This is sub-optimal but avoids some nasty corner cases, and is expected to
+//   occur very rarely.
+// - on exit we have set the Require, Change, and initial Exit modes.
+//
+// The resulting BlockData is stored in BlockInfo at index MBB.getNumber().
+void SIModeRegister::processBlockPhase1(MachineBasicBlock &MBB,
+                                        const SIInstrInfo *TII) {
+  auto NewInfo = llvm::make_unique<BlockData>();
+  MachineInstr *InsertionPoint = nullptr;
+  // RequirePending is used to indicate whether we are collecting the initial
+  // requirements for the block, and need to defer the first InsertionPoint to
+  // Phase 3. It is set to false once we have set FirstInsertionPoint, or when
+  // we discover an explicit setreg that means this block doesn't have any
+  // initial requirements.
+  bool RequirePending = true;
+  // Value of Change at the time the current InsertionPoint was chosen.
+  Status IPChange;
+  for (MachineInstr &MI : MBB) {
+    Status InstrMode = getInstructionMode(MI, TII);
+    if ((MI.getOpcode() == AMDGPU::S_SETREG_B32) ||
+        (MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32)) {
+      // We preserve any explicit mode register setreg instruction we encounter,
+      // as we assume it has been inserted by a higher authority (this is
+      // likely to be a very rare occurrence).
+      unsigned Dst = TII->getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm();
+      // Only writes to the Mode register are of interest.
+      if (((Dst & AMDGPU::Hwreg::ID_MASK_) >> AMDGPU::Hwreg::ID_SHIFT_) !=
+          AMDGPU::Hwreg::ID_MODE)
+        continue;
+
+      unsigned Width = ((Dst & AMDGPU::Hwreg::WIDTH_M1_MASK_) >>
+                        AMDGPU::Hwreg::WIDTH_M1_SHIFT_) +
+                       1;
+      unsigned Offset =
+          (Dst & AMDGPU::Hwreg::OFFSET_MASK_) >> AMDGPU::Hwreg::OFFSET_SHIFT_;
+      // Build the field mask with unsigned arithmetic. "1 << Width" is a
+      // signed shift and is undefined for a setreg that covers the top of the
+      // register (Width of 31 or 32).
+      unsigned FieldMask = Width >= 32 ? ~0u : (1u << Width) - 1;
+      unsigned Mask = FieldMask << Offset;
+
+      // If an InsertionPoint is set we will insert a setreg there.
+      if (InsertionPoint) {
+        insertSetreg(MBB, InsertionPoint, TII, IPChange.delta(NewInfo->Change));
+        InsertionPoint = nullptr;
+      }
+      // If this is an immediate then we know the value being set, but if it is
+      // not an immediate then we treat the modified bits of the mode register
+      // as unknown.
+      if (MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32) {
+        unsigned Val = TII->getNamedOperand(MI, AMDGPU::OpName::imm)->getImm();
+        unsigned Mode = (Val << Offset) & Mask;
+        Status Setreg = Status(Mask, Mode);
+        // If we haven't already set the initial requirements for the block we
+        // don't need to as the requirements start from this explicit setreg.
+        RequirePending = false;
+        NewInfo->Change = NewInfo->Change.merge(Setreg);
+      } else {
+        NewInfo->Change = NewInfo->Change.mergeUnknown(Mask);
+      }
+    } else if (!NewInfo->Change.isCompatible(InstrMode)) {
+      // This instruction uses the Mode register and its requirements aren't
+      // compatible with the current mode.
+      if (InsertionPoint) {
+        // If the required mode change cannot be included in the current
+        // InsertionPoint changes, we need a setreg and start a new
+        // InsertionPoint.
+        if (!IPChange.delta(NewInfo->Change).isCombinable(InstrMode)) {
+          if (RequirePending) {
+            // This is the first insertionPoint in the block so we will defer
+            // the insertion of the setreg to Phase 3 where we know whether or
+            // not it is actually needed.
+            NewInfo->FirstInsertionPoint = InsertionPoint;
+            NewInfo->Require = NewInfo->Change;
+            RequirePending = false;
+          } else {
+            insertSetreg(MBB, InsertionPoint, TII,
+                         IPChange.delta(NewInfo->Change));
+            IPChange = NewInfo->Change;
+          }
+          // Set the new InsertionPoint
+          InsertionPoint = &MI;
+        }
+        NewInfo->Change = NewInfo->Change.merge(InstrMode);
+      } else {
+        // No InsertionPoint is currently set - this is either the first in
+        // the block or we have previously seen an explicit setreg.
+        InsertionPoint = &MI;
+        IPChange = NewInfo->Change;
+        NewInfo->Change = NewInfo->Change.merge(InstrMode);
+      }
+    }
+  }
+  if (RequirePending) {
+    // If we haven't yet set the initial requirements for the block we set them
+    // now.
+    NewInfo->FirstInsertionPoint = InsertionPoint;
+    NewInfo->Require = NewInfo->Change;
+  } else if (InsertionPoint) {
+    // We need to insert a setreg at the InsertionPoint
+    insertSetreg(MBB, InsertionPoint, TII, IPChange.delta(NewInfo->Change));
+  }
+  // Initial Exit value; Phase 2 refines it using the predecessors.
+  NewInfo->Exit = NewInfo->Change;
+  BlockInfo[MBB.getNumber()] = std::move(NewInfo);
+}
+
+// Phase 2 computes, for one block, the Mode register value that all of its
+// predecessors agree on, and from that the value on exit from the block.
+// Whenever the exit value changes, the successors are queued on Phase2List so
+// that the new value is propagated to them.
+void SIModeRegister::processBlockPhase2(MachineBasicBlock &MBB,
+                                        const SIInstrInfo *TII) {
+  const unsigned ThisBlock = MBB.getNumber();
+  BlockData &Info = *BlockInfo[ThisBlock];
+
+  if (MBB.pred_empty()) {
+    // Nothing flows in, so the block starts from the default status.
+    Info.Pred = DefaultStatus;
+  } else {
+    // Start from the first predecessor's exit status and narrow it down with
+    // the exit status of each remaining predecessor.
+    auto PI = MBB.pred_begin();
+    const auto PE = MBB.pred_end();
+    Info.Pred = BlockInfo[(*PI)->getNumber()]->Exit;
+    for (++PI; PI != PE; ++PI)
+      Info.Pred = Info.Pred.intersect(BlockInfo[(*PI)->getNumber()]->Exit);
+  }
+
+  const Status NewExit = Info.Pred.merge(Info.Change);
+  if (Info.Exit == NewExit)
+    return;
+
+  // The exit status moved: record it and revisit every successor.
+  Info.Exit = NewExit;
+  for (MachineBasicBlock *Succ : MBB.successors())
+    Phase2List.push(Succ);
+}
+
+// Phase 3 compares the mode provided by a block's predecessors with the mode
+// the block requires on entry. When they are not compatible, a setreg carrying
+// the difference is inserted in front of the block's first insertion point,
+// or in front of the block's first instruction if no insertion point was
+// recorded.
+void SIModeRegister::processBlockPhase3(MachineBasicBlock &MBB,
+                                        const SIInstrInfo *TII) {
+  const unsigned ThisBlock = MBB.getNumber();
+  BlockData &Info = *BlockInfo[ThisBlock];
+
+  // Predecessors already provide what the block needs: nothing to insert.
+  if (Info.Pred.isCompatible(Info.Require))
+    return;
+
+  Status Delta = Info.Pred.delta(Info.Require);
+  MachineInstr *InsertBefore = Info.FirstInsertionPoint
+                                   ? Info.FirstInsertionPoint
+                                   : &MBB.instr_front();
+  insertSetreg(MBB, InsertBefore, TII, Delta);
+}
+
+// Runs the three phases over every block of MF.
+// Returns true if any instruction was inserted into MF.
+bool SIModeRegister::runOnMachineFunction(MachineFunction &MF) {
+  // The pass only ever inserts instructions, so comparing instruction counts
+  // before and after tells us exactly whether this function was modified.
+  // The NumSetregInserted statistic cannot be used for that: it is a global
+  // that accumulates across functions, and it is not updated in builds where
+  // statistics are disabled.
+  auto CountInstrs = [&MF]() {
+    unsigned Count = 0;
+    for (const MachineBasicBlock &BB : MF)
+      Count += BB.size();
+    return Count;
+  };
+  const unsigned NumInstrsBefore = CountInstrs();
+
+  BlockInfo.resize(MF.getNumBlockIDs());
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+
+  // Processing is performed in a number of phases
+
+  // Phase 1 - determine the initial mode required by each block, and add setreg
+  // instructions for intra block requirements.
+  for (MachineBasicBlock &BB : MF)
+    processBlockPhase1(BB, TII);
+
+  // Phase 2 - determine the exit mode from each block. We add all blocks to the
+  // list here, but will also add any that need to be revisited during Phase 2
+  // processing.
+  for (MachineBasicBlock &BB : MF)
+    Phase2List.push(&BB);
+  while (!Phase2List.empty()) {
+    processBlockPhase2(*Phase2List.front(), TII);
+    Phase2List.pop();
+  }
+
+  // Phase 3 - add an initial setreg to each block where the required entry mode
+  // is not satisfied by the exit mode of all its predecessors.
+  for (MachineBasicBlock &BB : MF)
+    processBlockPhase3(BB, TII);
+
+  BlockInfo.clear();
+
+  return CountInstrs() != NumInstrsBefore;
+}
diff --git a/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
index 7b678d12ba818..c671fed34bdf1 100644
--- a/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
+++ b/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
@@ -103,6 +103,122 @@ static MachineInstr* getOrExecSource(const MachineInstr &MI,
return SaveExecInst;
}
+// Optimize sequence
+// %sel = V_CNDMASK_B32_e64 0, 1, %cc
+// %cmp = V_CMP_NE_U32 1, %sel
+// $vcc = S_AND_B64 $exec, %cmp
+// S_CBRANCH_VCC[N]Z
+// =>
+// $vcc = S_ANDN2_B64 $exec, %cc
+// S_CBRANCH_VCC[N]Z
+//
+// It is the negation pattern inserted by DAGCombiner::visitBRCOND() in the
+// rebuildSetCC(). We start with S_CBRANCH to avoid exhaustive search, but
+// only 3 first instructions are really needed. S_AND_B64 with exec is a
+// required part of the pattern since V_CNDMASK_B32 writes zeroes for inactive
+// lanes.
+//
+// Returns %cc register on success, AMDGPU::NoRegister if the pattern was not
+// matched (in which case nothing is modified).
+static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB,
+ const GCNSubtarget &ST,
+ MachineRegisterInfo &MRI,
+ LiveIntervals *LIS) {
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const unsigned AndOpc = AMDGPU::S_AND_B64;
+ const unsigned Andn2Opc = AMDGPU::S_ANDN2_B64;
+ const unsigned CondReg = AMDGPU::VCC;
+ const unsigned ExecReg = AMDGPU::EXEC;
+
+ // Locate a VCC-conditional branch among the block's terminators.
+ auto I = llvm::find_if(MBB.terminators(), [](const MachineInstr &MI) {
+ unsigned Opc = MI.getOpcode();
+ return Opc == AMDGPU::S_CBRANCH_VCCZ ||
+ Opc == AMDGPU::S_CBRANCH_VCCNZ; });
+ if (I == MBB.terminators().end())
+ return AMDGPU::NoRegister;
+
+ // The VCC value reaching the branch must come from an S_AND_B64 of two
+ // registers.
+ auto *And = TRI->findReachingDef(CondReg, AMDGPU::NoSubRegister,
+ *I, MRI, LIS);
+ if (!And || And->getOpcode() != AndOpc ||
+ !And->getOperand(1).isReg() || !And->getOperand(2).isReg())
+ return AMDGPU::NoRegister;
+
+ // One source of the S_AND_B64 must be EXEC (in either position); the other
+ // one is the candidate compare result.
+ MachineOperand *AndCC = &And->getOperand(1);
+ unsigned CmpReg = AndCC->getReg();
+ unsigned CmpSubReg = AndCC->getSubReg();
+ if (CmpReg == ExecReg) {
+ AndCC = &And->getOperand(2);
+ CmpReg = AndCC->getReg();
+ CmpSubReg = AndCC->getSubReg();
+ } else if (And->getOperand(2).getReg() != ExecReg) {
+ return AMDGPU::NoRegister;
+ }
+
+ // The compare must be a V_CMP_NE_U32 located in the same block as the
+ // S_AND_B64.
+ auto *Cmp = TRI->findReachingDef(CmpReg, CmpSubReg, *And, MRI, LIS);
+ if (!Cmp || !(Cmp->getOpcode() == AMDGPU::V_CMP_NE_U32_e32 ||
+ Cmp->getOpcode() == AMDGPU::V_CMP_NE_U32_e64) ||
+ Cmp->getParent() != And->getParent())
+ return AMDGPU::NoRegister;
+
+ // Normalize the compare operands so that Op1 is the register and Op2 is the
+ // immediate, which must be 1.
+ MachineOperand *Op1 = TII->getNamedOperand(*Cmp, AMDGPU::OpName::src0);
+ MachineOperand *Op2 = TII->getNamedOperand(*Cmp, AMDGPU::OpName::src1);
+ if (Op1->isImm() && Op2->isReg())
+ std::swap(Op1, Op2);
+ if (!Op1->isReg() || !Op2->isImm() || Op2->getImm() != 1)
+ return AMDGPU::NoRegister;
+
+ // The compared register must be defined by a V_CNDMASK_B32_e64.
+ unsigned SelReg = Op1->getReg();
+ auto *Sel = TRI->findReachingDef(SelReg, Op1->getSubReg(), *Cmp, MRI, LIS);
+ if (!Sel || Sel->getOpcode() != AMDGPU::V_CNDMASK_B32_e64)
+ return AMDGPU::NoRegister;
+
+ // The select must have immediate sources 0 and 1 and a register condition.
+ Op1 = TII->getNamedOperand(*Sel, AMDGPU::OpName::src0);
+ Op2 = TII->getNamedOperand(*Sel, AMDGPU::OpName::src1);
+ MachineOperand *CC = TII->getNamedOperand(*Sel, AMDGPU::OpName::src2);
+ if (!Op1->isImm() || !Op2->isImm() || !CC->isReg() ||
+ Op1->getImm() != 0 || Op2->getImm() != 1)
+ return AMDGPU::NoRegister;
+
+ LLVM_DEBUG(dbgs() << "Folding sequence:\n\t" << *Sel << '\t'
+ << *Cmp << '\t' << *And);
+
+ // Replace the S_AND_B64 with an S_ANDN2_B64 of EXEC and the original
+ // condition, keeping the LiveIntervals instruction maps in step.
+ unsigned CCReg = CC->getReg();
+ LIS->RemoveMachineInstrFromMaps(*And);
+ MachineInstr *Andn2 = BuildMI(MBB, *And, And->getDebugLoc(),
+ TII->get(Andn2Opc), And->getOperand(0).getReg())
+ .addReg(ExecReg)
+ .addReg(CCReg, CC->getSubReg());
+ And->eraseFromParent();
+ LIS->InsertMachineInstrInMaps(*Andn2);
+
+ LLVM_DEBUG(dbgs() << "=>\n\t" << *Andn2 << '\n');
+
+ // Try to remove the compare. If its result is in VCC, it must not be read
+ // between the compare and the new S_ANDN2_B64; if it is in a virtual
+ // register, that register must have no remaining non-debug uses.
+ if ((TargetRegisterInfo::isVirtualRegister(CmpReg) &&
+ MRI.use_nodbg_empty(CmpReg)) ||
+ (CmpReg == CondReg &&
+ std::none_of(std::next(Cmp->getIterator()), Andn2->getIterator(),
+ [&](const MachineInstr &MI) {
+ return MI.readsRegister(CondReg, TRI); }))) {
+ LLVM_DEBUG(dbgs() << "Erasing: " << *Cmp << '\n');
+
+ LIS->RemoveMachineInstrFromMaps(*Cmp);
+ Cmp->eraseFromParent();
+
+ // Try to remove v_cndmask_b32.
+ if (TargetRegisterInfo::isVirtualRegister(SelReg) &&
+ MRI.use_nodbg_empty(SelReg)) {
+ LLVM_DEBUG(dbgs() << "Erasing: " << *Sel << '\n');
+
+ LIS->RemoveMachineInstrFromMaps(*Sel);
+ Sel->eraseFromParent();
+ }
+ }
+
+ return CCReg;
+}
+
bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
@@ -117,9 +233,24 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
for (MachineBasicBlock &MBB : MF) {
+ if (unsigned Reg = optimizeVcndVcmpPair(MBB, ST, MRI, LIS)) {
+ RecalcRegs.insert(Reg);
+ RecalcRegs.insert(AMDGPU::VCC_LO);
+ RecalcRegs.insert(AMDGPU::VCC_HI);
+ RecalcRegs.insert(AMDGPU::SCC);
+ Changed = true;
+ }
+
// Try to remove unneeded instructions before s_endpgm.
if (MBB.succ_empty()) {
- if (MBB.empty() || MBB.back().getOpcode() != AMDGPU::S_ENDPGM)
+ if (MBB.empty())
+ continue;
+
+ // Skip this if the endpgm has any implicit uses, otherwise we would need
+ // to be careful to update / remove them.
+ MachineInstr &Term = MBB.back();
+ if (Term.getOpcode() != AMDGPU::S_ENDPGM ||
+ Term.getNumOperands() != 0)
continue;
SmallVector<MachineBasicBlock*, 4> Blocks({&MBB});
diff --git a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 0e000b72962eb..2d43d5d05ef64 100644
--- a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -90,7 +90,9 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override;
void matchSDWAOperands(MachineBasicBlock &MBB);
std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
- bool isConvertibleToSDWA(const MachineInstr &MI, const GCNSubtarget &ST) const;
+ bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST) const;
+ void pseudoOpConvertToVOP2(MachineInstr &MI,
+ const GCNSubtarget &ST) const;
bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;
@@ -854,7 +856,82 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
}
}
-bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI,
+// Convert the V_ADDC_U32_e64 into V_ADDC_U32_e32, and
+// V_ADD_I32_e64 into V_ADD_I32_e32. This allows isConvertibleToSDWA
+// to perform its transformation on V_ADD_I32_e32 into V_ADD_I32_sdwa.
+//
+// We are transforming from a VOP3 into a VOP2 form of the instruction.
+// %19:vgpr_32 = V_AND_B32_e32 255,
+// killed %16:vgpr_32, implicit $exec
+// %47:vgpr_32, %49:sreg_64_xexec = V_ADD_I32_e64
+// %26.sub0:vreg_64, %19:vgpr_32, implicit $exec
+// %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
+// %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec
+//
+// becomes
+// %47:vgpr_32 = V_ADD_I32_sdwa
+// 0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0,
+// implicit-def $vcc, implicit $exec
+// %48:vgpr_32 = V_ADDC_U32_e32
+// 0, %26.sub1:vreg_64, implicit-def $vcc, implicit $vcc, implicit $exec
+//
+// MI must be a V_ADD_I32_e64 or V_SUB_I32_e64 (asserted). If any of the
+// checks below fails the function returns without modifying anything;
+// otherwise both MI and the single user of its carry-out are erased and
+// replaced by their e32 forms.
+void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
+ const GCNSubtarget &ST) const {
+ int Opc = MI.getOpcode();
+ assert((Opc == AMDGPU::V_ADD_I32_e64 || Opc == AMDGPU::V_SUB_I32_e64) &&
+ "Currently only handles V_ADD_I32_e64 or V_SUB_I32_e64");
+
+ // Can the candidate MI be shrunk?
+ if (!TII->canShrink(MI, *MRI))
+ return;
+ Opc = AMDGPU::getVOPe32(Opc);
+ // Find the related instruction: the single user of MI's carry-out (sdst).
+ const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
+ if (!Sdst)
+ return;
+ MachineOperand *NextOp = findSingleRegUse(Sdst, MRI);
+ if (!NextOp)
+ return;
+ MachineInstr &MISucc = *NextOp->getParent();
+ // Can the successor be shrunk?
+ if (!TII->canShrink(MISucc, *MRI))
+ return;
+ int SuccOpc = AMDGPU::getVOPe32(MISucc.getOpcode());
+ // Make sure the carry in/out are subsequently unused.
+ MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2);
+ if (!CarryIn)
+ return;
+ MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst);
+ if (!CarryOut)
+ return;
+ if (!MRI->hasOneUse(CarryIn->getReg()) || !MRI->use_empty(CarryOut->getReg()))
+ return;
+ // Make sure VCC or its subregs are dead before MI.
+ // NOTE(review): 25 is presumably the number of neighbouring instructions
+ // computeRegisterLiveness may scan - confirm against its declaration.
+ MachineBasicBlock &MBB = *MI.getParent();
+ auto Liveness = MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25);
+ if (Liveness != MachineBasicBlock::LQR_Dead)
+ return;
+ // Check that VCC is not modified by any instruction strictly between MI and
+ // MISucc, i.e. in the open range (MI, MISucc).
+ for (auto I = std::next(MI.getIterator()), E = MISucc.getIterator();
+ I != E; ++I) {
+ if (I->modifiesRegister(AMDGPU::VCC, TRI))
+ return;
+ }
+ // Make the two new e32 instruction variants.
+ // Replace MI with V_{SUB|ADD}_I32_e32
+ // Only vdst, src0 and src1 are copied over explicitly.
+ auto NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc));
+ NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst));
+ NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));
+ NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1));
+ MI.eraseFromParent();
+ // Replace MISucc with V_{SUBB|ADDC}_U32_e32
+ auto NewInst = BuildMI(MBB, MISucc, MISucc.getDebugLoc(), TII->get(SuccOpc));
+ NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::vdst));
+ NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src0));
+ NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src1));
+ MISucc.eraseFromParent();
+}
+
+bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI,
const GCNSubtarget &ST) const {
// Check if this is already an SDWA instruction
unsigned Opc = MI.getOpcode();
@@ -1127,6 +1204,22 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
for (MachineBasicBlock &MBB : MF) {
bool Changed = false;
do {
+ // Preprocess the ADD/SUB pairs so they could be SDWA'ed.
+ // Look for a possible ADD or SUB that resulted from a previously lowered
+ // V_{ADD|SUB}_U64_PSEUDO. The function pseudoOpConvertToVOP2
+ // lowers the pair of instructions into e32 form.
+ matchSDWAOperands(MBB);
+ for (const auto &OperandPair : SDWAOperands) {
+ const auto &Operand = OperandPair.second;
+ MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
+ if (PotentialMI &&
+ (PotentialMI->getOpcode() == AMDGPU::V_ADD_I32_e64 ||
+ PotentialMI->getOpcode() == AMDGPU::V_SUB_I32_e64))
+ pseudoOpConvertToVOP2(*PotentialMI, ST);
+ }
+ SDWAOperands.clear();
+
+ // Generate potential match list.
matchSDWAOperands(MBB);
for (const auto &OperandPair : SDWAOperands) {
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 624607f6ea542..97cfde2b23541 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -18,9 +18,12 @@
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
@@ -495,15 +498,16 @@ static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII,
return false;
const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
- MachineInstrBuilder NewMI = BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
- .add(*Reg)
- .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
- .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
- .addImm(Offset)
- .addImm(0) // glc
- .addImm(0) // slc
- .addImm(0) // tfe
- .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+ MachineInstrBuilder NewMI =
+ BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
+ .add(*Reg)
+ .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
+ .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
+ .addImm(Offset)
+ .addImm(0) // glc
+ .addImm(0) // slc
+ .addImm(0) // tfe
+ .cloneMemRefs(*MI);
const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
AMDGPU::OpName::vdata_in);
@@ -900,7 +904,7 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
.addImm(0) // glc
.addMemOperand(MMO);
- if (NumSubRegs > 1)
+ if (NumSubRegs > 1 && i == 0)
MIB.addReg(SuperReg, RegState::ImplicitDefine);
continue;
@@ -914,7 +918,7 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
.addReg(Spill.VGPR)
.addImm(Spill.Lane);
- if (NumSubRegs > 1)
+ if (NumSubRegs > 1 && i == 0)
MIB.addReg(SuperReg, RegState::ImplicitDefine);
} else {
if (OnlyToVGPR)
@@ -1598,3 +1602,57 @@ SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
llvm_unreachable("not implemented");
}
}
+
+// Find reaching register definition
+MachineInstr *SIRegisterInfo::findReachingDef(unsigned Reg, unsigned SubReg,
+ MachineInstr &Use,
+ MachineRegisterInfo &MRI,
+ LiveIntervals *LIS) const {
+ auto &MDT = LIS->getAnalysis<MachineDominatorTree>();
+ SlotIndex UseIdx = LIS->getInstructionIndex(Use);
+ SlotIndex DefIdx;
+
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (!LIS->hasInterval(Reg))
+ return nullptr;
+ LiveInterval &LI = LIS->getInterval(Reg);
+ LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg)
+ : MRI.getMaxLaneMaskForVReg(Reg);
+ VNInfo *V = nullptr;
+ if (LI.hasSubRanges()) {
+ for (auto &S : LI.subranges()) {
+ if ((S.LaneMask & SubLanes) == SubLanes) {
+ V = S.getVNInfoAt(UseIdx);
+ break;
+ }
+ }
+ } else {
+ V = LI.getVNInfoAt(UseIdx);
+ }
+ if (!V)
+ return nullptr;
+ DefIdx = V->def;
+ } else {
+ // Find last def.
+ for (MCRegUnitIterator Units(Reg, this); Units.isValid(); ++Units) {
+ LiveRange &LR = LIS->getRegUnit(*Units);
+ if (VNInfo *V = LR.getVNInfoAt(UseIdx)) {
+ if (!DefIdx.isValid() ||
+ MDT.dominates(LIS->getInstructionFromIndex(DefIdx),
+ LIS->getInstructionFromIndex(V->def)))
+ DefIdx = V->def;
+ } else {
+ return nullptr;
+ }
+ }
+ }
+
+ MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx);
+
+ if (!Def || !MDT.dominates(Def, &Use))
+ return nullptr;
+
+ assert(Def->modifiesRegister(Reg, this));
+
+ return Def;
+}
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h
index 5a51b67ca719c..b82fefde47e13 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -228,6 +228,12 @@ public:
getConstrainedRegClassForOperand(const MachineOperand &MO,
const MachineRegisterInfo &MRI) const override;
+ // Find reaching register definition
+ MachineInstr *findReachingDef(unsigned Reg, unsigned SubReg,
+ MachineInstr &Use,
+ MachineRegisterInfo &MRI,
+ LiveIntervals *LIS) const;
+
private:
void buildSpillLoadStore(MachineBasicBlock::iterator MI,
unsigned LoadStoreOp,
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.td b/lib/Target/AMDGPU/SIRegisterInfo.td
index f87a0763b353b..c625ecc9b750e 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -435,7 +435,7 @@ def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
let AllocationPriority = 7;
}
-def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, (add SGPR_64Regs)> {
+def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16], 32, (add SGPR_64Regs)> {
let CopyCost = 1;
let AllocationPriority = 8;
}
@@ -444,13 +444,13 @@ def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, (add
let isAllocatable = 0;
}
-def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1, v4i16, v4f16], 32,
+def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32,
(add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, TTMP_64, TBA, TMA)> {
let CopyCost = 1;
let AllocationPriority = 8;
}
-def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1, v4i16, v4f16], 32,
+def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32,
(add SReg_64_XEXEC, EXEC)> {
let CopyCost = 1;
let AllocationPriority = 8;
@@ -459,15 +459,15 @@ def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1, v4i16, v4f16], 32,
// Requires 2 s_mov_b64 to copy
let CopyCost = 2 in {
-def SGPR_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128Regs)> {
+def SGPR_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64], 32, (add SGPR_128Regs)> {
let AllocationPriority = 10;
}
-def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add TTMP_128Regs)> {
+def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64], 32, (add TTMP_128Regs)> {
let isAllocatable = 0;
}
-def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64, v2f64], 32,
+def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32,
(add SGPR_128, TTMP_128)> {
let AllocationPriority = 10;
}
diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 4189bcce52ea1..6ad7dd0e3a7ce 100644
--- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -64,59 +64,6 @@ FunctionPass *llvm::createSIShrinkInstructionsPass() {
return new SIShrinkInstructions();
}
-static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
- const SIRegisterInfo &TRI,
- const MachineRegisterInfo &MRI) {
-
- const MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
- // Can't shrink instruction with three operands.
- // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add
- // a special case for it. It can only be shrunk if the third operand
- // is vcc. We should handle this the same way we handle vopc, by addding
- // a register allocation hint pre-regalloc and then do the shrinking
- // post-regalloc.
- if (Src2) {
- switch (MI.getOpcode()) {
- default: return false;
-
- case AMDGPU::V_ADDC_U32_e64:
- case AMDGPU::V_SUBB_U32_e64:
- case AMDGPU::V_SUBBREV_U32_e64: {
- const MachineOperand *Src1
- = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
- if (!Src1->isReg() || !TRI.isVGPR(MRI, Src1->getReg()))
- return false;
- // Additional verification is needed for sdst/src2.
- return true;
- }
- case AMDGPU::V_MAC_F32_e64:
- case AMDGPU::V_MAC_F16_e64:
- case AMDGPU::V_FMAC_F32_e64:
- if (!Src2->isReg() || !TRI.isVGPR(MRI, Src2->getReg()) ||
- TII->hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
- return false;
- break;
-
- case AMDGPU::V_CNDMASK_B32_e64:
- break;
- }
- }
-
- const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
- if (Src1 && (!Src1->isReg() || !TRI.isVGPR(MRI, Src1->getReg()) ||
- TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
- return false;
-
- // We don't need to check src0, all input types are legal, so just make sure
- // src0 isn't using any modifiers.
- if (TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
- return false;
-
- // Check output modifiers
- return !TII->hasModifiersSet(MI, AMDGPU::OpName::omod) &&
- !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp);
-}
-
/// This function checks \p MI for operands defined by a move immediate
/// instruction and then folds the literal constant into the instruction if it
/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instructions.
@@ -173,19 +120,6 @@ static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
return false;
}
-// Copy MachineOperand with all flags except setting it as implicit.
-static void copyFlagsToImplicitVCC(MachineInstr &MI,
- const MachineOperand &Orig) {
-
- for (MachineOperand &Use : MI.implicit_operands()) {
- if (Use.isUse() && Use.getReg() == AMDGPU::VCC) {
- Use.setIsUndef(Orig.isUndef());
- Use.setIsKill(Orig.isKill());
- return;
- }
- }
-}
-
static bool isKImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
return isInt<16>(Src.getImm()) &&
!TII->isInlineConstant(*Src.getParent(),
@@ -278,6 +212,245 @@ static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) {
}
}
+/// Attempt to shrink AND/OR/XOR operations requiring non-inlinable literals.
+/// For AND or OR, try using S_BITSET{0,1} to clear or set bits.
+/// If the inverse of the immediate is legal, use ANDN2, ORN2 or
+/// XNOR (as a ^ b == ~(a ^ ~b)).
+/// \returns true if the caller should skip \p MI and continue with the next instruction
+static bool shrinkScalarLogicOp(const GCNSubtarget &ST,
+ MachineRegisterInfo &MRI,
+ const SIInstrInfo *TII,
+ MachineInstr &MI) {
+ unsigned Opc = MI.getOpcode();
+ const MachineOperand *Dest = &MI.getOperand(0);
+ MachineOperand *Src0 = &MI.getOperand(1);
+ MachineOperand *Src1 = &MI.getOperand(2);
+ MachineOperand *SrcReg = Src0;
+ MachineOperand *SrcImm = Src1;
+
+ if (SrcImm->isImm() &&
+ !AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST.hasInv2PiInlineImm())) {
+ uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm());
+ uint32_t NewImm = 0;
+
+ if (Opc == AMDGPU::S_AND_B32) {
+ if (isPowerOf2_32(~Imm)) {
+ NewImm = countTrailingOnes(Imm);
+ Opc = AMDGPU::S_BITSET0_B32;
+ } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
+ NewImm = ~Imm;
+ Opc = AMDGPU::S_ANDN2_B32;
+ }
+ } else if (Opc == AMDGPU::S_OR_B32) {
+ if (isPowerOf2_32(Imm)) {
+ NewImm = countTrailingZeros(Imm);
+ Opc = AMDGPU::S_BITSET1_B32;
+ } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
+ NewImm = ~Imm;
+ Opc = AMDGPU::S_ORN2_B32;
+ }
+ } else if (Opc == AMDGPU::S_XOR_B32) {
+ if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
+ NewImm = ~Imm;
+ Opc = AMDGPU::S_XNOR_B32;
+ }
+ } else {
+ llvm_unreachable("unexpected opcode");
+ }
+
+ if ((Opc == AMDGPU::S_ANDN2_B32 || Opc == AMDGPU::S_ORN2_B32) &&
+ SrcImm == Src0) {
+ if (!TII->commuteInstruction(MI, false, 1, 2))
+ NewImm = 0;
+ }
+
+ if (NewImm != 0) {
+ if (TargetRegisterInfo::isVirtualRegister(Dest->getReg()) &&
+ SrcReg->isReg()) {
+ MRI.setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
+ MRI.setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
+ return true;
+ }
+
+ if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) {
+ MI.setDesc(TII->get(Opc));
+ if (Opc == AMDGPU::S_BITSET0_B32 ||
+ Opc == AMDGPU::S_BITSET1_B32) {
+ Src0->ChangeToImmediate(NewImm);
+ MI.RemoveOperand(2);
+ } else {
+ SrcImm->setImm(NewImm);
+ }
+ }
+ }
+ }
+
+ return false;
+}
+
+// This is the same as MachineInstr::readsRegister/modifiesRegister except
+// it takes subregs into account.
+static bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
+ unsigned Reg, unsigned SubReg,
+ const SIRegisterInfo &TRI) {
+ for (const MachineOperand &MO : R) {
+ if (!MO.isReg())
+ continue;
+
+ if (TargetRegisterInfo::isPhysicalRegister(Reg) &&
+ TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
+ if (TRI.regsOverlap(Reg, MO.getReg()))
+ return true;
+ } else if (MO.getReg() == Reg &&
+ TargetRegisterInfo::isVirtualRegister(Reg)) {
+ LaneBitmask Overlap = TRI.getSubRegIndexLaneMask(SubReg) &
+ TRI.getSubRegIndexLaneMask(MO.getSubReg());
+ if (Overlap.any())
+ return true;
+ }
+ }
+ return false;
+}
+
+static bool instReadsReg(const MachineInstr *MI,
+ unsigned Reg, unsigned SubReg,
+ const SIRegisterInfo &TRI) {
+ return instAccessReg(MI->uses(), Reg, SubReg, TRI);
+}
+
+static bool instModifiesReg(const MachineInstr *MI,
+ unsigned Reg, unsigned SubReg,
+ const SIRegisterInfo &TRI) {
+ return instAccessReg(MI->defs(), Reg, SubReg, TRI);
+}
+
+static TargetInstrInfo::RegSubRegPair
+getSubRegForIndex(unsigned Reg, unsigned Sub, unsigned I,
+ const SIRegisterInfo &TRI, const MachineRegisterInfo &MRI) {
+ if (TRI.getRegSizeInBits(Reg, MRI) != 32) {
+ if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ Reg = TRI.getSubReg(Reg, TRI.getSubRegFromChannel(I));
+ } else {
+ LaneBitmask LM = TRI.getSubRegIndexLaneMask(Sub);
+ Sub = TRI.getSubRegFromChannel(I + countTrailingZeros(LM.getAsInteger()));
+ }
+ }
+ return TargetInstrInfo::RegSubRegPair(Reg, Sub);
+}
+
+// Match:
+// mov t, x
+// mov x, y
+// mov y, t
+//
+// =>
+//
+// mov t, x (t is potentially dead and move eliminated)
+// v_swap_b32 x, y
+//
+// Returns next valid instruction pointer if it was able to create v_swap_b32.
+//
+// This should not be done too early, so as not to prevent possible folding
+// that may remove the matched moves. It should preferably be done before RA to
+// release saved registers, and possibly also after RA, which can insert copies
+// too.
+//
+// This is really just a generic peephole that is not a canonical shrinking,
+// although requirements match the pass placement and it reduces code size too.
+static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI,
+ const SIInstrInfo *TII) {
+ assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
+ MovT.getOpcode() == AMDGPU::COPY);
+
+ unsigned T = MovT.getOperand(0).getReg();
+ unsigned Tsub = MovT.getOperand(0).getSubReg();
+ MachineOperand &Xop = MovT.getOperand(1);
+
+ if (!Xop.isReg())
+ return nullptr;
+ unsigned X = Xop.getReg();
+ unsigned Xsub = Xop.getSubReg();
+
+ unsigned Size = TII->getOpSize(MovT, 0) / 4;
+
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
+ if (!TRI.isVGPR(MRI, X))
+ return nullptr;
+
+ for (MachineOperand &YTop : MRI.use_nodbg_operands(T)) {
+ if (YTop.getSubReg() != Tsub)
+ continue;
+
+ MachineInstr &MovY = *YTop.getParent();
+ if ((MovY.getOpcode() != AMDGPU::V_MOV_B32_e32 &&
+ MovY.getOpcode() != AMDGPU::COPY) ||
+ MovY.getOperand(1).getSubReg() != Tsub)
+ continue;
+
+ unsigned Y = MovY.getOperand(0).getReg();
+ unsigned Ysub = MovY.getOperand(0).getSubReg();
+
+ if (!TRI.isVGPR(MRI, Y) || MovT.getParent() != MovY.getParent())
+ continue;
+
+ MachineInstr *MovX = nullptr;
+ auto I = std::next(MovT.getIterator()), E = MovT.getParent()->instr_end();
+ for (auto IY = MovY.getIterator(); I != E && I != IY; ++I) {
+ if (instReadsReg(&*I, X, Xsub, TRI) ||
+ instModifiesReg(&*I, Y, Ysub, TRI) ||
+ instModifiesReg(&*I, T, Tsub, TRI) ||
+ (MovX && instModifiesReg(&*I, X, Xsub, TRI))) {
+ MovX = nullptr;
+ break;
+ }
+ if (!instReadsReg(&*I, Y, Ysub, TRI)) {
+ if (!MovX && instModifiesReg(&*I, X, Xsub, TRI)) {
+ MovX = nullptr;
+ break;
+ }
+ continue;
+ }
+ if (MovX ||
+ (I->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
+ I->getOpcode() != AMDGPU::COPY) ||
+ I->getOperand(0).getReg() != X ||
+ I->getOperand(0).getSubReg() != Xsub) {
+ MovX = nullptr;
+ break;
+ }
+ MovX = &*I;
+ }
+
+ if (!MovX || I == E)
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Matched v_swap_b32:\n" << MovT << *MovX << MovY);
+
+ for (unsigned I = 0; I < Size; ++I) {
+ TargetInstrInfo::RegSubRegPair X1, Y1;
+ X1 = getSubRegForIndex(X, Xsub, I, TRI, MRI);
+ Y1 = getSubRegForIndex(Y, Ysub, I, TRI, MRI);
+ BuildMI(*MovT.getParent(), MovX->getIterator(), MovT.getDebugLoc(),
+ TII->get(AMDGPU::V_SWAP_B32))
+ .addDef(X1.Reg, 0, X1.SubReg)
+ .addDef(Y1.Reg, 0, Y1.SubReg)
+ .addReg(Y1.Reg, 0, Y1.SubReg)
+ .addReg(X1.Reg, 0, X1.SubReg).getInstr();
+ }
+ MovX->eraseFromParent();
+ MovY.eraseFromParent();
+ MachineInstr *Next = &*std::next(MovT.getIterator());
+ if (MRI.use_nodbg_empty(T))
+ MovT.eraseFromParent();
+ else
+ Xop.setIsKill(false);
+
+ return Next;
+ }
+
+ return nullptr;
+}
+
bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
@@ -285,7 +458,6 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
MachineRegisterInfo &MRI = MF.getRegInfo();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
- const SIRegisterInfo &TRI = TII->getRegisterInfo();
std::vector<unsigned> I1Defs;
@@ -319,6 +491,14 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
}
}
+ if (ST.hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
+ MI.getOpcode() == AMDGPU::COPY)) {
+ if (auto *NextMI = matchSwap(MI, MRI, TII)) {
+ Next = NextMI->getIterator();
+ continue;
+ }
+ }
+
// Combine adjacent s_nops to use the immediate operand encoding how long
// to wait.
//
@@ -408,14 +588,22 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
continue;
}
+ // Shrink scalar logic operations.
+ if (MI.getOpcode() == AMDGPU::S_AND_B32 ||
+ MI.getOpcode() == AMDGPU::S_OR_B32 ||
+ MI.getOpcode() == AMDGPU::S_XOR_B32) {
+ if (shrinkScalarLogicOp(ST, MRI, TII, MI))
+ continue;
+ }
+
if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
continue;
- if (!canShrink(MI, TII, TRI, MRI)) {
+ if (!TII->canShrink(MI, MRI)) {
// Try commuting the instruction and see if that enables us to shrink
// it.
if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
- !canShrink(MI, TII, TRI, MRI))
+ !TII->canShrink(MI, MRI))
continue;
}
@@ -488,40 +676,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
// We can shrink this instruction
LLVM_DEBUG(dbgs() << "Shrinking " << MI);
- MachineInstrBuilder Inst32 =
- BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32));
-
- // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
- // For VOPC instructions, this is replaced by an implicit def of vcc.
- int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst);
- if (Op32DstIdx != -1) {
- // dst
- Inst32.add(MI.getOperand(0));
- } else {
- assert(MI.getOperand(0).getReg() == AMDGPU::VCC &&
- "Unexpected case");
- }
-
-
- Inst32.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));
-
- const MachineOperand *Src1 =
- TII->getNamedOperand(MI, AMDGPU::OpName::src1);
- if (Src1)
- Inst32.add(*Src1);
-
- if (Src2) {
- int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
- if (Op32Src2Idx != -1) {
- Inst32.add(*Src2);
- } else {
- // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
- // replaced with an implicit read of vcc. This was already added
- // during the initial BuildMI, so find it to preserve the flags.
- copyFlagsToImplicitVCC(*Inst32, *Src2);
- }
- }
-
+ MachineInstr *Inst32 = TII->buildShrunkInst(MI, Op32);
++NumInstructionsShrunk;
// Copy extra operands not present in the instruction definition.
diff --git a/lib/Target/AMDGPU/SMInstructions.td b/lib/Target/AMDGPU/SMInstructions.td
index 7485326017b26..8a063e1a48673 100644
--- a/lib/Target/AMDGPU/SMInstructions.td
+++ b/lib/Target/AMDGPU/SMInstructions.td
@@ -375,83 +375,6 @@ defm S_DCACHE_DISCARD_X2 : SM_Pseudo_Discards <"s_dcache_discard_x2">;
}
//===----------------------------------------------------------------------===//
-// Scalar Memory Patterns
-//===----------------------------------------------------------------------===//
-
-
-def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{
- auto Ld = cast<LoadSDNode>(N);
- return Ld->getAlignment() >= 4 &&
- ((((Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) || (Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT)) && !N->isDivergent()) ||
- (Subtarget->getScalarizeGlobalBehavior() && Ld->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS &&
- !Ld->isVolatile() && !N->isDivergent() &&
- static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpHasNoClobberedMemOperand(N)));
-}]>;
-
-def SMRDImm : ComplexPattern<i64, 2, "SelectSMRDImm">;
-def SMRDImm32 : ComplexPattern<i64, 2, "SelectSMRDImm32">;
-def SMRDSgpr : ComplexPattern<i64, 2, "SelectSMRDSgpr">;
-def SMRDBufferImm : ComplexPattern<i32, 1, "SelectSMRDBufferImm">;
-def SMRDBufferImm32 : ComplexPattern<i32, 1, "SelectSMRDBufferImm32">;
-
-multiclass SMRD_Pattern <string Instr, ValueType vt> {
-
- // 1. IMM offset
- def : GCNPat <
- (smrd_load (SMRDImm i64:$sbase, i32:$offset)),
- (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, 0))
- >;
-
- // 2. SGPR offset
- def : GCNPat <
- (smrd_load (SMRDSgpr i64:$sbase, i32:$offset)),
- (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $offset, 0))
- >;
-}
-
-let OtherPredicates = [isSICI] in {
-def : GCNPat <
- (i64 (readcyclecounter)),
- (S_MEMTIME)
->;
-}
-
-// Global and constant loads can be selected to either MUBUF or SMRD
-// instructions, but SMRD instructions are faster so we want the instruction
-// selector to prefer those.
-let AddedComplexity = 100 in {
-
-defm : SMRD_Pattern <"S_LOAD_DWORD", i32>;
-defm : SMRD_Pattern <"S_LOAD_DWORDX2", v2i32>;
-defm : SMRD_Pattern <"S_LOAD_DWORDX4", v4i32>;
-defm : SMRD_Pattern <"S_LOAD_DWORDX8", v8i32>;
-defm : SMRD_Pattern <"S_LOAD_DWORDX16", v16i32>;
-
-// 1. Offset as an immediate
-def SM_LOAD_PATTERN : GCNPat < // name this pattern to reuse AddedComplexity on CI
- (SIload_constant v4i32:$sbase, (SMRDBufferImm i32:$offset)),
- (S_BUFFER_LOAD_DWORD_IMM $sbase, $offset, 0)
->;
-
-// 2. Offset loaded in an 32bit SGPR
-def : GCNPat <
- (SIload_constant v4i32:$sbase, i32:$offset),
- (S_BUFFER_LOAD_DWORD_SGPR $sbase, $offset, 0)
->;
-
-} // End let AddedComplexity = 100
-
-let OtherPredicates = [isVI] in {
-
-def : GCNPat <
- (i64 (readcyclecounter)),
- (S_MEMREALTIME)
->;
-
-} // let OtherPredicates = [isVI]
-
-
-//===----------------------------------------------------------------------===//
// Targets
//===----------------------------------------------------------------------===//
@@ -757,25 +680,97 @@ class SMRD_Real_ci <bits<5> op, SM_Pseudo ps>
def S_DCACHE_INV_VOL_ci : SMRD_Real_ci <0x1d, S_DCACHE_INV_VOL>;
-let AddedComplexity = SM_LOAD_PATTERN.AddedComplexity in {
+//===----------------------------------------------------------------------===//
+// Scalar Memory Patterns
+//===----------------------------------------------------------------------===//
+
+def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ return isUniformLoad(N);}]>;
+
+def SMRDImm : ComplexPattern<i64, 2, "SelectSMRDImm">;
+def SMRDImm32 : ComplexPattern<i64, 2, "SelectSMRDImm32">;
+def SMRDSgpr : ComplexPattern<i64, 2, "SelectSMRDSgpr">;
+def SMRDBufferImm : ComplexPattern<i32, 1, "SelectSMRDBufferImm">;
+def SMRDBufferImm32 : ComplexPattern<i32, 1, "SelectSMRDBufferImm32">;
+
+multiclass SMRD_Pattern <string Instr, ValueType vt> {
+
+ // 1. IMM offset
+ def : GCNPat <
+ (smrd_load (SMRDImm i64:$sbase, i32:$offset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, 0))
+ >;
+
+ // 2. 32-bit IMM offset on CI
+ def : GCNPat <
+ (smrd_load (SMRDImm32 i64:$sbase, i32:$offset)),
+ (vt (!cast<InstSI>(Instr#"_IMM_ci") $sbase, $offset, 0))> {
+ let OtherPredicates = [isCIOnly];
+ }
+
+ // 3. SGPR offset
+ def : GCNPat <
+ (smrd_load (SMRDSgpr i64:$sbase, i32:$offset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $offset, 0))
+ >;
+}
+
+multiclass SMLoad_Pattern <string Instr, ValueType vt> {
+ // 1. Offset as an immediate
+ def : GCNPat <
+ (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm i32:$offset), i1:$glc),
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, (as_i1imm $glc)))
+ >;
+
+ // 2. 32-bit IMM offset on CI
+ def : GCNPat <
+ (vt (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm32 i32:$offset), i1:$glc)),
+ (!cast<InstSI>(Instr#"_IMM_ci") $sbase, $offset, (as_i1imm $glc))> {
+ let OtherPredicates = [isCIOnly];
+ }
-class SMRD_Pattern_ci <string Instr, ValueType vt> : GCNPat <
- (smrd_load (SMRDImm32 i64:$sbase, i32:$offset)),
- (vt (!cast<InstSI>(Instr#"_IMM_ci") $sbase, $offset, 0))> {
- let OtherPredicates = [isCIOnly];
+ // 3. Offset loaded in a 32-bit SGPR
+ def : GCNPat <
+ (SIsbuffer_load v4i32:$sbase, i32:$offset, i1:$glc),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $offset, (as_i1imm $glc)))
+ >;
}
-def : SMRD_Pattern_ci <"S_LOAD_DWORD", i32>;
-def : SMRD_Pattern_ci <"S_LOAD_DWORDX2", v2i32>;
-def : SMRD_Pattern_ci <"S_LOAD_DWORDX4", v4i32>;
-def : SMRD_Pattern_ci <"S_LOAD_DWORDX8", v8i32>;
-def : SMRD_Pattern_ci <"S_LOAD_DWORDX16", v16i32>;
+// Global and constant loads can be selected to either MUBUF or SMRD
+// instructions, but SMRD instructions are faster so we want the instruction
+// selector to prefer those.
+let AddedComplexity = 100 in {
+
+defm : SMRD_Pattern <"S_LOAD_DWORD", i32>;
+defm : SMRD_Pattern <"S_LOAD_DWORDX2", v2i32>;
+defm : SMRD_Pattern <"S_LOAD_DWORDX4", v4i32>;
+defm : SMRD_Pattern <"S_LOAD_DWORDX8", v8i32>;
+defm : SMRD_Pattern <"S_LOAD_DWORDX16", v16i32>;
+
+defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORD", i32>;
+defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX2", v2i32>;
+defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX4", v4i32>;
+defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX8", v8i32>;
+defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX16", v16i32>;
+
+defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORD", f32>;
+defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX2", v2f32>;
+defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX4", v4f32>;
+defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX8", v8f32>;
+defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX16", v16f32>;
+} // End let AddedComplexity = 100
+let OtherPredicates = [isSICI] in {
def : GCNPat <
- (SIload_constant v4i32:$sbase, (SMRDBufferImm32 i32:$offset)),
- (S_BUFFER_LOAD_DWORD_IMM_ci $sbase, $offset, 0)> {
- let OtherPredicates = [isCI]; // should this be isCIOnly?
+ (i64 (readcyclecounter)),
+ (S_MEMTIME)
+>;
}
-} // End let AddedComplexity = SM_LOAD_PATTERN.AddedComplexity
+let OtherPredicates = [isVI] in {
+def : GCNPat <
+ (i64 (readcyclecounter)),
+ (S_MEMREALTIME)
+>;
+
+} // let OtherPredicates = [isVI]
diff --git a/lib/Target/AMDGPU/SOPInstructions.td b/lib/Target/AMDGPU/SOPInstructions.td
index 6f5db9644c868..ca5e981ac5c25 100644
--- a/lib/Target/AMDGPU/SOPInstructions.td
+++ b/lib/Target/AMDGPU/SOPInstructions.td
@@ -336,42 +336,54 @@ class SOP2_64_32_32 <string opName, list<dag> pattern=[]> : SOP2_Pseudo <
"$sdst, $src0, $src1", pattern
>;
+class UniformUnaryFrag<SDPatternOperator Op> : PatFrag <
+ (ops node:$src0),
+ (Op $src0),
+ [{ return !N->isDivergent(); }]
+>;
+
+class UniformBinFrag<SDPatternOperator Op> : PatFrag <
+ (ops node:$src0, node:$src1),
+ (Op $src0, $src1),
+ [{ return !N->isDivergent(); }]
+>;
+
let Defs = [SCC] in { // Carry out goes to SCC
let isCommutable = 1 in {
def S_ADD_U32 : SOP2_32 <"s_add_u32">;
def S_ADD_I32 : SOP2_32 <"s_add_i32",
- [(set i32:$sdst, (add SSrc_b32:$src0, SSrc_b32:$src1))]
+ [(set i32:$sdst, (UniformBinFrag<add> SSrc_b32:$src0, SSrc_b32:$src1))]
>;
} // End isCommutable = 1
def S_SUB_U32 : SOP2_32 <"s_sub_u32">;
def S_SUB_I32 : SOP2_32 <"s_sub_i32",
- [(set i32:$sdst, (sub SSrc_b32:$src0, SSrc_b32:$src1))]
+ [(set i32:$sdst, (UniformBinFrag<sub> SSrc_b32:$src0, SSrc_b32:$src1))]
>;
let Uses = [SCC] in { // Carry in comes from SCC
let isCommutable = 1 in {
def S_ADDC_U32 : SOP2_32 <"s_addc_u32",
- [(set i32:$sdst, (adde (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]>;
+ [(set i32:$sdst, (UniformBinFrag<adde> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]>;
} // End isCommutable = 1
def S_SUBB_U32 : SOP2_32 <"s_subb_u32",
- [(set i32:$sdst, (sube (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]>;
+ [(set i32:$sdst, (UniformBinFrag<sube> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]>;
} // End Uses = [SCC]
let isCommutable = 1 in {
def S_MIN_I32 : SOP2_32 <"s_min_i32",
- [(set i32:$sdst, (smin i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (UniformBinFrag<smin> i32:$src0, i32:$src1))]
>;
def S_MIN_U32 : SOP2_32 <"s_min_u32",
- [(set i32:$sdst, (umin i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (UniformBinFrag<umin> i32:$src0, i32:$src1))]
>;
def S_MAX_I32 : SOP2_32 <"s_max_i32",
- [(set i32:$sdst, (smax i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (UniformBinFrag<smax> i32:$src0, i32:$src1))]
>;
def S_MAX_U32 : SOP2_32 <"s_max_u32",
- [(set i32:$sdst, (umax i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (UniformBinFrag<umax> i32:$src0, i32:$src1))]
>;
} // End isCommutable = 1
} // End Defs = [SCC]
@@ -385,27 +397,27 @@ let Uses = [SCC] in {
let Defs = [SCC] in {
let isCommutable = 1 in {
def S_AND_B32 : SOP2_32 <"s_and_b32",
- [(set i32:$sdst, (and i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (UniformBinFrag<and> i32:$src0, i32:$src1))]
>;
def S_AND_B64 : SOP2_64 <"s_and_b64",
- [(set i64:$sdst, (and i64:$src0, i64:$src1))]
+ [(set i64:$sdst, (UniformBinFrag<and> i64:$src0, i64:$src1))]
>;
def S_OR_B32 : SOP2_32 <"s_or_b32",
- [(set i32:$sdst, (or i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (UniformBinFrag<or> i32:$src0, i32:$src1))]
>;
def S_OR_B64 : SOP2_64 <"s_or_b64",
- [(set i64:$sdst, (or i64:$src0, i64:$src1))]
+ [(set i64:$sdst, (UniformBinFrag<or> i64:$src0, i64:$src1))]
>;
def S_XOR_B32 : SOP2_32 <"s_xor_b32",
- [(set i32:$sdst, (xor i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (UniformBinFrag<xor> i32:$src0, i32:$src1))]
>;
def S_XOR_B64 : SOP2_64 <"s_xor_b64",
- [(set i64:$sdst, (xor i64:$src0, i64:$src1))]
+ [(set i64:$sdst, (UniformBinFrag<xor> i64:$src0, i64:$src1))]
>;
def S_XNOR_B32 : SOP2_32 <"s_xnor_b32",
@@ -415,45 +427,71 @@ def S_XNOR_B32 : SOP2_32 <"s_xnor_b32",
def S_XNOR_B64 : SOP2_64 <"s_xnor_b64",
[(set i64:$sdst, (not (xor_oneuse i64:$src0, i64:$src1)))]
>;
+
+def S_NAND_B32 : SOP2_32 <"s_nand_b32",
+ [(set i32:$sdst, (not (and_oneuse i32:$src0, i32:$src1)))]
+>;
+
+def S_NAND_B64 : SOP2_64 <"s_nand_b64",
+ [(set i64:$sdst, (not (and_oneuse i64:$src0, i64:$src1)))]
+>;
+
+def S_NOR_B32 : SOP2_32 <"s_nor_b32",
+ [(set i32:$sdst, (not (or_oneuse i32:$src0, i32:$src1)))]
+>;
+
+def S_NOR_B64 : SOP2_64 <"s_nor_b64",
+ [(set i64:$sdst, (not (or_oneuse i64:$src0, i64:$src1)))]
+>;
} // End isCommutable = 1
-def S_ANDN2_B32 : SOP2_32 <"s_andn2_b32">;
-def S_ANDN2_B64 : SOP2_64 <"s_andn2_b64">;
-def S_ORN2_B32 : SOP2_32 <"s_orn2_b32">;
-def S_ORN2_B64 : SOP2_64 <"s_orn2_b64">;
-def S_NAND_B32 : SOP2_32 <"s_nand_b32">;
-def S_NAND_B64 : SOP2_64 <"s_nand_b64">;
-def S_NOR_B32 : SOP2_32 <"s_nor_b32">;
-def S_NOR_B64 : SOP2_64 <"s_nor_b64">;
+def S_ANDN2_B32 : SOP2_32 <"s_andn2_b32",
+ [(set i32:$sdst, (UniformBinFrag<and> i32:$src0, (UniformUnaryFrag<not> i32:$src1)))]
+>;
+
+def S_ANDN2_B64 : SOP2_64 <"s_andn2_b64",
+ [(set i64:$sdst, (UniformBinFrag<and> i64:$src0, (UniformUnaryFrag<not> i64:$src1)))]
+>;
+
+def S_ORN2_B32 : SOP2_32 <"s_orn2_b32",
+ [(set i32:$sdst, (UniformBinFrag<or> i32:$src0, (UniformUnaryFrag<not> i32:$src1)))]
+>;
+
+def S_ORN2_B64 : SOP2_64 <"s_orn2_b64",
+ [(set i64:$sdst, (UniformBinFrag<or> i64:$src0, (UniformUnaryFrag<not> i64:$src1)))]
+>;
} // End Defs = [SCC]
// Use added complexity so these patterns are preferred to the VALU patterns.
let AddedComplexity = 1 in {
let Defs = [SCC] in {
+// TODO: b64 versions require VOP3 change since v_lshlrev_b64 is VOP3
def S_LSHL_B32 : SOP2_32 <"s_lshl_b32",
- [(set i32:$sdst, (shl i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (UniformBinFrag<shl> i32:$src0, i32:$src1))]
>;
def S_LSHL_B64 : SOP2_64_32 <"s_lshl_b64",
- [(set i64:$sdst, (shl i64:$src0, i32:$src1))]
+ [(set i64:$sdst, (UniformBinFrag<shl> i64:$src0, i32:$src1))]
>;
def S_LSHR_B32 : SOP2_32 <"s_lshr_b32",
- [(set i32:$sdst, (srl i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (UniformBinFrag<srl> i32:$src0, i32:$src1))]
>;
def S_LSHR_B64 : SOP2_64_32 <"s_lshr_b64",
- [(set i64:$sdst, (srl i64:$src0, i32:$src1))]
+ [(set i64:$sdst, (UniformBinFrag<srl> i64:$src0, i32:$src1))]
>;
def S_ASHR_I32 : SOP2_32 <"s_ashr_i32",
- [(set i32:$sdst, (sra i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (UniformBinFrag<sra> i32:$src0, i32:$src1))]
>;
def S_ASHR_I64 : SOP2_64_32 <"s_ashr_i64",
- [(set i64:$sdst, (sra i64:$src0, i32:$src1))]
+ [(set i64:$sdst, (UniformBinFrag<sra> i64:$src0, i32:$src1))]
>;
} // End Defs = [SCC]
def S_BFM_B32 : SOP2_32 <"s_bfm_b32",
- [(set i32:$sdst, (AMDGPUbfm i32:$src0, i32:$src1))]>;
+ [(set i32:$sdst, (UniformBinFrag<AMDGPUbfm> i32:$src0, i32:$src1))]>;
def S_BFM_B64 : SOP2_64_32_32 <"s_bfm_b64">;
+
+// TODO: S_MUL_I32 requires V_MUL_LO_I32 from VOP3 change
def S_MUL_I32 : SOP2_32 <"s_mul_i32",
[(set i32:$sdst, (mul i32:$src0, i32:$src1))]> {
let isCommutable = 1;
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 4eba193823154..54c866bdc63ce 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -128,131 +128,127 @@ int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels) {
return NewInfo ? NewInfo->Opcode : -1;
}
-// Wrapper for Tablegen'd function. enum Subtarget is not defined in any
-// header files, so we need to wrap it in a function that takes unsigned
-// instead.
-int getMCOpcode(uint16_t Opcode, unsigned Gen) {
- return getMCOpcodeGen(Opcode, static_cast<Subtarget>(Gen));
+struct MUBUFInfo { // One MUBUF lookup record; presumably the row type of the table pulled in via GET_MUBUFInfoTable_IMPL below -- confirm against the generated .inc.
+ uint16_t Opcode; // Opcode of this entry; the value returned by getMUBUFOpcode().
+ uint16_t BaseOpcode; // Returned by getMUBUFBaseOpcode(); looked up together with a dword count by getMUBUFOpcode().
+ uint8_t dwords; // Returned by getMUBUFDwords(), which reports 0 when no entry is found.
+ bool has_vaddr; // Returned by getMUBUFHasVAddr().
+ bool has_srsrc; // Returned by getMUBUFHasSrsrc().
+ bool has_soffset; // Returned by getMUBUFHasSoffset().
+};
+
+#define GET_MUBUFInfoTable_DECL
+#define GET_MUBUFInfoTable_IMPL
+#include "AMDGPUGenSearchableTables.inc"
+
+int getMUBUFBaseOpcode(unsigned Opc) { // Returns the BaseOpcode recorded for Opc, or -1 when the lookup finds no entry.
+ const MUBUFInfo *Info = getMUBUFInfoFromOpcode(Opc); // NOTE(review): the sibling accessors call getMUBUFOpcodeHelper() instead; presumably both are generated lookups keyed on Opcode -- confirm.
+ return Info ? Info->BaseOpcode : -1;
}
-namespace IsaInfo {
+int getMUBUFOpcode(unsigned BaseOpc, unsigned Dwords) { // Returns the Opcode recorded for the (BaseOpc, Dwords) pair, or -1 when the lookup finds no entry.
+ const MUBUFInfo *Info = getMUBUFInfoFromBaseOpcodeAndDwords(BaseOpc, Dwords);
+ return Info ? Info->Opcode : -1;
+}
-IsaVersion getIsaVersion(const FeatureBitset &Features) {
- // GCN GFX6 (Southern Islands (SI)).
- if (Features.test(FeatureISAVersion6_0_0))
- return {6, 0, 0};
- if (Features.test(FeatureISAVersion6_0_1))
- return {6, 0, 1};
+int getMUBUFDwords(unsigned Opc) { // Returns the dwords field recorded for Opc, or 0 when the lookup finds no entry.
+ const MUBUFInfo *Info = getMUBUFOpcodeHelper(Opc);
+ return Info ? Info->dwords : 0;
+}
- // GCN GFX7 (Sea Islands (CI)).
- if (Features.test(FeatureISAVersion7_0_0))
- return {7, 0, 0};
- if (Features.test(FeatureISAVersion7_0_1))
- return {7, 0, 1};
- if (Features.test(FeatureISAVersion7_0_2))
- return {7, 0, 2};
- if (Features.test(FeatureISAVersion7_0_3))
- return {7, 0, 3};
- if (Features.test(FeatureISAVersion7_0_4))
- return {7, 0, 4};
- if (Features.test(FeatureSeaIslands))
- return {7, 0, 0};
+bool getMUBUFHasVAddr(unsigned Opc) { // Returns the has_vaddr flag recorded for Opc; false when the lookup finds no entry.
+ const MUBUFInfo *Info = getMUBUFOpcodeHelper(Opc);
+ return Info ? Info->has_vaddr : false;
+}
- // GCN GFX8 (Volcanic Islands (VI)).
- if (Features.test(FeatureISAVersion8_0_1))
- return {8, 0, 1};
- if (Features.test(FeatureISAVersion8_0_2))
- return {8, 0, 2};
- if (Features.test(FeatureISAVersion8_0_3))
- return {8, 0, 3};
- if (Features.test(FeatureISAVersion8_1_0))
- return {8, 1, 0};
- if (Features.test(FeatureVolcanicIslands))
- return {8, 0, 0};
+bool getMUBUFHasSrsrc(unsigned Opc) { // Returns the has_srsrc flag recorded for Opc; false when the lookup finds no entry.
+ const MUBUFInfo *Info = getMUBUFOpcodeHelper(Opc);
+ return Info ? Info->has_srsrc : false;
+}
- // GCN GFX9.
- if (Features.test(FeatureISAVersion9_0_0))
- return {9, 0, 0};
- if (Features.test(FeatureISAVersion9_0_2))
- return {9, 0, 2};
- if (Features.test(FeatureISAVersion9_0_4))
- return {9, 0, 4};
- if (Features.test(FeatureISAVersion9_0_6))
- return {9, 0, 6};
- if (Features.test(FeatureGFX9))
- return {9, 0, 0};
+bool getMUBUFHasSoffset(unsigned Opc) { // Returns the has_soffset flag recorded for Opc; false when the lookup finds no entry.
+ const MUBUFInfo *Info = getMUBUFOpcodeHelper(Opc);
+ return Info ? Info->has_soffset : false;
+}
- if (Features.test(FeatureSouthernIslands))
- return {0, 0, 0};
- return {7, 0, 0};
+// Wrapper for Tablegen'd function. enum Subtarget is not defined in any
+// header files, so we need to wrap it in a function that takes unsigned
+// instead.
+int getMCOpcode(uint16_t Opcode, unsigned Gen) {
+ return getMCOpcodeGen(Opcode, static_cast<Subtarget>(Gen)); // Gen is converted to the Subtarget enum unchecked; presumably callers pass a valid enumerator value -- confirm.
}
+namespace IsaInfo {
+
void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream) {
auto TargetTriple = STI->getTargetTriple();
- auto ISAVersion = IsaInfo::getIsaVersion(STI->getFeatureBits());
+ auto Version = getIsaVersion(STI->getCPU());
Stream << TargetTriple.getArchName() << '-'
<< TargetTriple.getVendorName() << '-'
<< TargetTriple.getOSName() << '-'
<< TargetTriple.getEnvironmentName() << '-'
<< "gfx"
- << ISAVersion.Major
- << ISAVersion.Minor
- << ISAVersion.Stepping;
+ << Version.Major
+ << Version.Minor
+ << Version.Stepping;
if (hasXNACK(*STI))
Stream << "+xnack";
+ if (hasSRAMECC(*STI))
+ Stream << "+sram-ecc";
Stream.flush();
}
bool hasCodeObjectV3(const MCSubtargetInfo *STI) {
- return STI->getFeatureBits().test(FeatureCodeObjectV3);
+ return STI->getTargetTriple().getOS() == Triple::AMDHSA &&
+ STI->getFeatureBits().test(FeatureCodeObjectV3);
}
-unsigned getWavefrontSize(const FeatureBitset &Features) {
- if (Features.test(FeatureWavefrontSize16))
+unsigned getWavefrontSize(const MCSubtargetInfo *STI) {
+ if (STI->getFeatureBits().test(FeatureWavefrontSize16))
return 16;
- if (Features.test(FeatureWavefrontSize32))
+ if (STI->getFeatureBits().test(FeatureWavefrontSize32))
return 32;
return 64;
}
-unsigned getLocalMemorySize(const FeatureBitset &Features) {
- if (Features.test(FeatureLocalMemorySize32768))
+unsigned getLocalMemorySize(const MCSubtargetInfo *STI) {
+ if (STI->getFeatureBits().test(FeatureLocalMemorySize32768))
return 32768;
- if (Features.test(FeatureLocalMemorySize65536))
+ if (STI->getFeatureBits().test(FeatureLocalMemorySize65536))
return 65536;
return 0;
}
-unsigned getEUsPerCU(const FeatureBitset &Features) {
+unsigned getEUsPerCU(const MCSubtargetInfo *STI) {
return 4;
}
-unsigned getMaxWorkGroupsPerCU(const FeatureBitset &Features,
+unsigned getMaxWorkGroupsPerCU(const MCSubtargetInfo *STI,
unsigned FlatWorkGroupSize) {
- if (!Features.test(FeatureGCN))
+ if (!STI->getFeatureBits().test(FeatureGCN))
return 8;
- unsigned N = getWavesPerWorkGroup(Features, FlatWorkGroupSize);
+ unsigned N = getWavesPerWorkGroup(STI, FlatWorkGroupSize);
if (N == 1)
return 40;
N = 40 / N;
return std::min(N, 16u);
}
-unsigned getMaxWavesPerCU(const FeatureBitset &Features) {
- return getMaxWavesPerEU() * getEUsPerCU(Features);
+unsigned getMaxWavesPerCU(const MCSubtargetInfo *STI) {
+ return getMaxWavesPerEU() * getEUsPerCU(STI);
}
-unsigned getMaxWavesPerCU(const FeatureBitset &Features,
+unsigned getMaxWavesPerCU(const MCSubtargetInfo *STI,
unsigned FlatWorkGroupSize) {
- return getWavesPerWorkGroup(Features, FlatWorkGroupSize);
+ return getWavesPerWorkGroup(STI, FlatWorkGroupSize);
}
-unsigned getMinWavesPerEU(const FeatureBitset &Features) {
+unsigned getMinWavesPerEU(const MCSubtargetInfo *STI) {
return 1;
}
@@ -261,89 +257,89 @@ unsigned getMaxWavesPerEU() {
return 10;
}
-unsigned getMaxWavesPerEU(const FeatureBitset &Features,
+unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI,
unsigned FlatWorkGroupSize) {
- return alignTo(getMaxWavesPerCU(Features, FlatWorkGroupSize),
- getEUsPerCU(Features)) / getEUsPerCU(Features);
+ return alignTo(getMaxWavesPerCU(STI, FlatWorkGroupSize),
+ getEUsPerCU(STI)) / getEUsPerCU(STI);
}
-unsigned getMinFlatWorkGroupSize(const FeatureBitset &Features) {
+unsigned getMinFlatWorkGroupSize(const MCSubtargetInfo *STI) {
return 1;
}
-unsigned getMaxFlatWorkGroupSize(const FeatureBitset &Features) {
+unsigned getMaxFlatWorkGroupSize(const MCSubtargetInfo *STI) {
return 2048;
}
-unsigned getWavesPerWorkGroup(const FeatureBitset &Features,
+unsigned getWavesPerWorkGroup(const MCSubtargetInfo *STI,
unsigned FlatWorkGroupSize) {
- return alignTo(FlatWorkGroupSize, getWavefrontSize(Features)) /
- getWavefrontSize(Features);
+ return alignTo(FlatWorkGroupSize, getWavefrontSize(STI)) /
+ getWavefrontSize(STI);
}
-unsigned getSGPRAllocGranule(const FeatureBitset &Features) {
- IsaVersion Version = getIsaVersion(Features);
+unsigned getSGPRAllocGranule(const MCSubtargetInfo *STI) {
+ IsaVersion Version = getIsaVersion(STI->getCPU());
if (Version.Major >= 8)
return 16;
return 8;
}
-unsigned getSGPREncodingGranule(const FeatureBitset &Features) {
+unsigned getSGPREncodingGranule(const MCSubtargetInfo *STI) {
return 8;
}
-unsigned getTotalNumSGPRs(const FeatureBitset &Features) {
- IsaVersion Version = getIsaVersion(Features);
+unsigned getTotalNumSGPRs(const MCSubtargetInfo *STI) {
+ IsaVersion Version = getIsaVersion(STI->getCPU());
if (Version.Major >= 8)
return 800;
return 512;
}
-unsigned getAddressableNumSGPRs(const FeatureBitset &Features) {
- if (Features.test(FeatureSGPRInitBug))
+unsigned getAddressableNumSGPRs(const MCSubtargetInfo *STI) {
+ if (STI->getFeatureBits().test(FeatureSGPRInitBug))
return FIXED_NUM_SGPRS_FOR_INIT_BUG;
- IsaVersion Version = getIsaVersion(Features);
+ IsaVersion Version = getIsaVersion(STI->getCPU());
if (Version.Major >= 8)
return 102;
return 104;
}
-unsigned getMinNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU) {
+unsigned getMinNumSGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
assert(WavesPerEU != 0);
if (WavesPerEU >= getMaxWavesPerEU())
return 0;
- unsigned MinNumSGPRs = getTotalNumSGPRs(Features) / (WavesPerEU + 1);
- if (Features.test(FeatureTrapHandler))
+ unsigned MinNumSGPRs = getTotalNumSGPRs(STI) / (WavesPerEU + 1);
+ if (STI->getFeatureBits().test(FeatureTrapHandler))
MinNumSGPRs -= std::min(MinNumSGPRs, (unsigned)TRAP_NUM_SGPRS);
- MinNumSGPRs = alignDown(MinNumSGPRs, getSGPRAllocGranule(Features)) + 1;
- return std::min(MinNumSGPRs, getAddressableNumSGPRs(Features));
+ MinNumSGPRs = alignDown(MinNumSGPRs, getSGPRAllocGranule(STI)) + 1;
+ return std::min(MinNumSGPRs, getAddressableNumSGPRs(STI));
}
-unsigned getMaxNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU,
+unsigned getMaxNumSGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU,
bool Addressable) {
assert(WavesPerEU != 0);
- IsaVersion Version = getIsaVersion(Features);
- unsigned AddressableNumSGPRs = getAddressableNumSGPRs(Features);
+ IsaVersion Version = getIsaVersion(STI->getCPU());
+ unsigned AddressableNumSGPRs = getAddressableNumSGPRs(STI);
if (Version.Major >= 8 && !Addressable)
AddressableNumSGPRs = 112;
- unsigned MaxNumSGPRs = getTotalNumSGPRs(Features) / WavesPerEU;
- if (Features.test(FeatureTrapHandler))
+ unsigned MaxNumSGPRs = getTotalNumSGPRs(STI) / WavesPerEU;
+ if (STI->getFeatureBits().test(FeatureTrapHandler))
MaxNumSGPRs -= std::min(MaxNumSGPRs, (unsigned)TRAP_NUM_SGPRS);
- MaxNumSGPRs = alignDown(MaxNumSGPRs, getSGPRAllocGranule(Features));
+ MaxNumSGPRs = alignDown(MaxNumSGPRs, getSGPRAllocGranule(STI));
return std::min(MaxNumSGPRs, AddressableNumSGPRs);
}
-unsigned getNumExtraSGPRs(const FeatureBitset &Features, bool VCCUsed,
+unsigned getNumExtraSGPRs(const MCSubtargetInfo *STI, bool VCCUsed,
bool FlatScrUsed, bool XNACKUsed) {
unsigned ExtraSGPRs = 0;
if (VCCUsed)
ExtraSGPRs = 2;
- IsaVersion Version = getIsaVersion(Features);
+ IsaVersion Version = getIsaVersion(STI->getCPU());
if (Version.Major < 8) {
if (FlatScrUsed)
ExtraSGPRs = 4;
@@ -358,74 +354,74 @@ unsigned getNumExtraSGPRs(const FeatureBitset &Features, bool VCCUsed,
return ExtraSGPRs;
}
-unsigned getNumExtraSGPRs(const FeatureBitset &Features, bool VCCUsed,
+unsigned getNumExtraSGPRs(const MCSubtargetInfo *STI, bool VCCUsed,
bool FlatScrUsed) {
- return getNumExtraSGPRs(Features, VCCUsed, FlatScrUsed,
- Features[AMDGPU::FeatureXNACK]);
+ return getNumExtraSGPRs(STI, VCCUsed, FlatScrUsed,
+ STI->getFeatureBits().test(AMDGPU::FeatureXNACK));
}
-unsigned getNumSGPRBlocks(const FeatureBitset &Features, unsigned NumSGPRs) {
- NumSGPRs = alignTo(std::max(1u, NumSGPRs), getSGPREncodingGranule(Features));
+unsigned getNumSGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs) {
+ NumSGPRs = alignTo(std::max(1u, NumSGPRs), getSGPREncodingGranule(STI));
// SGPRBlocks is actual number of SGPR blocks minus 1.
- return NumSGPRs / getSGPREncodingGranule(Features) - 1;
+ return NumSGPRs / getSGPREncodingGranule(STI) - 1;
}
-unsigned getVGPRAllocGranule(const FeatureBitset &Features) {
+unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI) {
return 4;
}
-unsigned getVGPREncodingGranule(const FeatureBitset &Features) {
- return getVGPRAllocGranule(Features);
+unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI) {
+ return getVGPRAllocGranule(STI);
}
-unsigned getTotalNumVGPRs(const FeatureBitset &Features) {
+unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) {
return 256;
}
-unsigned getAddressableNumVGPRs(const FeatureBitset &Features) {
- return getTotalNumVGPRs(Features);
+unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) {
+ return getTotalNumVGPRs(STI);
}
-unsigned getMinNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU) {
+unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
assert(WavesPerEU != 0);
if (WavesPerEU >= getMaxWavesPerEU())
return 0;
unsigned MinNumVGPRs =
- alignDown(getTotalNumVGPRs(Features) / (WavesPerEU + 1),
- getVGPRAllocGranule(Features)) + 1;
- return std::min(MinNumVGPRs, getAddressableNumVGPRs(Features));
+ alignDown(getTotalNumVGPRs(STI) / (WavesPerEU + 1),
+ getVGPRAllocGranule(STI)) + 1;
+ return std::min(MinNumVGPRs, getAddressableNumVGPRs(STI));
}
-unsigned getMaxNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU) {
+unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
assert(WavesPerEU != 0);
- unsigned MaxNumVGPRs = alignDown(getTotalNumVGPRs(Features) / WavesPerEU,
- getVGPRAllocGranule(Features));
- unsigned AddressableNumVGPRs = getAddressableNumVGPRs(Features);
+ unsigned MaxNumVGPRs = alignDown(getTotalNumVGPRs(STI) / WavesPerEU,
+ getVGPRAllocGranule(STI));
+ unsigned AddressableNumVGPRs = getAddressableNumVGPRs(STI);
return std::min(MaxNumVGPRs, AddressableNumVGPRs);
}
-unsigned getNumVGPRBlocks(const FeatureBitset &Features, unsigned NumVGPRs) {
- NumVGPRs = alignTo(std::max(1u, NumVGPRs), getVGPREncodingGranule(Features));
+unsigned getNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumVGPRs) {
+ NumVGPRs = alignTo(std::max(1u, NumVGPRs), getVGPREncodingGranule(STI));
// VGPRBlocks is actual number of VGPR blocks minus 1.
- return NumVGPRs / getVGPREncodingGranule(Features) - 1;
+ return NumVGPRs / getVGPREncodingGranule(STI) - 1;
}
} // end namespace IsaInfo
void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
- const FeatureBitset &Features) {
- IsaInfo::IsaVersion ISA = IsaInfo::getIsaVersion(Features);
+ const MCSubtargetInfo *STI) {
+ IsaVersion Version = getIsaVersion(STI->getCPU());
memset(&Header, 0, sizeof(Header));
Header.amd_kernel_code_version_major = 1;
Header.amd_kernel_code_version_minor = 2;
Header.amd_machine_kind = 1; // AMD_MACHINE_KIND_AMDGPU
- Header.amd_machine_version_major = ISA.Major;
- Header.amd_machine_version_minor = ISA.Minor;
- Header.amd_machine_version_stepping = ISA.Stepping;
+ Header.amd_machine_version_major = Version.Major;
+ Header.amd_machine_version_minor = Version.Minor;
+ Header.amd_machine_version_stepping = Version.Stepping;
Header.kernel_code_entry_byte_offset = sizeof(Header);
// wavefront_size is specified as a power of 2: 2^6 = 64 threads.
Header.wavefront_size = 6;
@@ -513,7 +509,7 @@ std::pair<int, int> getIntegerPairAttribute(const Function &F,
return Ints;
}
-unsigned getVmcntBitMask(const IsaInfo::IsaVersion &Version) {
+unsigned getVmcntBitMask(const IsaVersion &Version) {
unsigned VmcntLo = (1 << getVmcntBitWidthLo()) - 1;
if (Version.Major < 9)
return VmcntLo;
@@ -522,15 +518,15 @@ unsigned getVmcntBitMask(const IsaInfo::IsaVersion &Version) {
return VmcntLo | VmcntHi;
}
-unsigned getExpcntBitMask(const IsaInfo::IsaVersion &Version) {
+unsigned getExpcntBitMask(const IsaVersion &Version) {
return (1 << getExpcntBitWidth()) - 1;
}
-unsigned getLgkmcntBitMask(const IsaInfo::IsaVersion &Version) {
+unsigned getLgkmcntBitMask(const IsaVersion &Version) {
return (1 << getLgkmcntBitWidth()) - 1;
}
-unsigned getWaitcntBitMask(const IsaInfo::IsaVersion &Version) {
+unsigned getWaitcntBitMask(const IsaVersion &Version) {
unsigned VmcntLo = getBitMask(getVmcntBitShiftLo(), getVmcntBitWidthLo());
unsigned Expcnt = getBitMask(getExpcntBitShift(), getExpcntBitWidth());
unsigned Lgkmcnt = getBitMask(getLgkmcntBitShift(), getLgkmcntBitWidth());
@@ -542,7 +538,7 @@ unsigned getWaitcntBitMask(const IsaInfo::IsaVersion &Version) {
return Waitcnt | VmcntHi;
}
-unsigned decodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt) {
+unsigned decodeVmcnt(const IsaVersion &Version, unsigned Waitcnt) {
unsigned VmcntLo =
unpackBits(Waitcnt, getVmcntBitShiftLo(), getVmcntBitWidthLo());
if (Version.Major < 9)
@@ -554,22 +550,30 @@ unsigned decodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt) {
return VmcntLo | VmcntHi;
}
-unsigned decodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt) {
+unsigned decodeExpcnt(const IsaVersion &Version, unsigned Waitcnt) {
return unpackBits(Waitcnt, getExpcntBitShift(), getExpcntBitWidth());
}
-unsigned decodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt) {
+unsigned decodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt) {
return unpackBits(Waitcnt, getLgkmcntBitShift(), getLgkmcntBitWidth());
}
-void decodeWaitcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
+void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt,
unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt) {
Vmcnt = decodeVmcnt(Version, Waitcnt);
Expcnt = decodeExpcnt(Version, Waitcnt);
Lgkmcnt = decodeLgkmcnt(Version, Waitcnt);
}
-unsigned encodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
+Waitcnt decodeWaitcnt(const IsaVersion &Version, unsigned Encoded) { // Struct-returning counterpart of the decodeWaitcnt() overload that fills three out-parameters.
+ Waitcnt Decoded;
+ Decoded.VmCnt = decodeVmcnt(Version, Encoded);
+ Decoded.ExpCnt = decodeExpcnt(Version, Encoded);
+ Decoded.LgkmCnt = decodeLgkmcnt(Version, Encoded);
+ return Decoded; // All three fields have been overwritten with the values decoded from Encoded.
+}
+
+unsigned encodeVmcnt(const IsaVersion &Version, unsigned Waitcnt,
unsigned Vmcnt) {
Waitcnt =
packBits(Vmcnt, Waitcnt, getVmcntBitShiftLo(), getVmcntBitWidthLo());
@@ -580,17 +584,17 @@ unsigned encodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
return packBits(Vmcnt, Waitcnt, getVmcntBitShiftHi(), getVmcntBitWidthHi());
}
-unsigned encodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
+unsigned encodeExpcnt(const IsaVersion &Version, unsigned Waitcnt,
unsigned Expcnt) {
return packBits(Expcnt, Waitcnt, getExpcntBitShift(), getExpcntBitWidth());
}
-unsigned encodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
+unsigned encodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt,
unsigned Lgkmcnt) {
return packBits(Lgkmcnt, Waitcnt, getLgkmcntBitShift(), getLgkmcntBitWidth());
}
-unsigned encodeWaitcnt(const IsaInfo::IsaVersion &Version,
+unsigned encodeWaitcnt(const IsaVersion &Version,
unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt) {
unsigned Waitcnt = getWaitcntBitMask(Version);
Waitcnt = encodeVmcnt(Version, Waitcnt, Vmcnt);
@@ -599,6 +603,10 @@ unsigned encodeWaitcnt(const IsaInfo::IsaVersion &Version,
return Waitcnt;
}
+unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded) { // Convenience overload: forwards the three fields of Decoded to the per-counter encodeWaitcnt() overload.
+ return encodeWaitcnt(Version, Decoded.VmCnt, Decoded.ExpCnt, Decoded.LgkmCnt);
+}
+
unsigned getInitialPSInputAddr(const Function &F) {
return getIntegerAttribute(F, "InitialPSInputAddr", 0);
}
@@ -643,6 +651,10 @@ bool hasXNACK(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureXNACK];
}
+bool hasSRAMECC(const MCSubtargetInfo &STI) { // True when the AMDGPU::FeatureSRAMECC bit is set in STI's feature bits.
+ return STI.getFeatureBits()[AMDGPU::FeatureSRAMECC];
+}
+
bool hasMIMG_R128(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureMIMG_R128];
}
@@ -798,6 +810,7 @@ unsigned getRegBitWidth(unsigned RCID) {
case AMDGPU::VS_64RegClassID:
case AMDGPU::SReg_64RegClassID:
case AMDGPU::VReg_64RegClassID:
+ case AMDGPU::SReg_64_XEXECRegClassID:
return 64;
case AMDGPU::VReg_96RegClassID:
return 96;
@@ -935,27 +948,50 @@ bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) {
isUInt<20>(EncodedOffset) : isUInt<8>(EncodedOffset);
}
-} // end namespace AMDGPU
-
-} // end namespace llvm
-
-namespace llvm {
-namespace AMDGPU {
+// Given Imm, split it into the values to put into the SOffset and ImmOffset
+// fields in an MUBUF instruction. Return false if it is not possible (due to a
+// hardware bug needing a workaround); the outputs are only written on success.
+//
+// The required alignment ensures that individual address components remain
+// aligned if they are aligned to begin with. It also ensures that additional
+// offsets within the given alignment can be added to the resulting ImmOffset.
+bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
+ const GCNSubtarget *Subtarget, uint32_t Align) {
+ const uint32_t MaxImm = alignDown(4095, Align); // Upper bound for the immediate part; presumably alignDown rounds 4095 down to a multiple of Align -- confirm.
+ uint32_t Overflow = 0;
-AMDGPUAS getAMDGPUAS(Triple T) {
- AMDGPUAS AS;
- AS.FLAT_ADDRESS = 0;
- AS.PRIVATE_ADDRESS = 5;
- AS.REGION_ADDRESS = 2;
- return AS;
-}
+ if (Imm > MaxImm) {
+ if (Imm <= MaxImm + 64) {
+ // Use an SOffset inline constant for 4..64
+ // NOTE(review): the code below yields Overflow in 1..64; the lower bound of
+ // 4 presumably assumes Align == 4 and an aligned Imm -- confirm.
+ Overflow = Imm - MaxImm;
+ Imm = MaxImm;
+ } else {
+ // Try to keep the same value in SOffset for adjacent loads, so that
+ // the corresponding register contents can be re-used.
+ //
+ // Load values with all low-bits (except for alignment bits) set into
+ // SOffset, so that a larger range of values can be covered using
+ // s_movk_i32.
+ //
+ // Atomic operations fail to work correctly when individual address
+ // components are unaligned, even if their sum is aligned.
+ uint32_t High = (Imm + Align) & ~4095; // NOTE(review): Imm + Align wraps for Imm close to UINT32_MAX -- confirm callers never pass such values.
+ uint32_t Low = (Imm + Align) & 4095;
+ Imm = Low;
+ Overflow = High - Align; // Low + (High - Align) equals the original Imm, so the split preserves the total offset.
+ }
+ }
-AMDGPUAS getAMDGPUAS(const TargetMachine &M) {
- return getAMDGPUAS(M.getTargetTriple());
-}
+ // There is a hardware bug in SI and CI which prevents address clamping in
+ // MUBUF instructions from working correctly with SOffsets. The immediate
+ // offset is unaffected.
+ if (Overflow > 0 &&
+ Subtarget->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
+ return false;
-AMDGPUAS getAMDGPUAS(const Module &M) {
- return getAMDGPUAS(Triple(M.getTargetTriple()));
+ ImmOffset = Imm;
+ SOffset = Overflow;
+ return true;
}
namespace {
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 5b7af8268cdaf..20123ed4ac815 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -19,6 +19,7 @@
#include "llvm/Support/AMDHSAKernelDescriptor.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetParser.h"
#include <cstdint>
#include <string>
#include <utility>
@@ -26,8 +27,10 @@
namespace llvm {
class Argument;
+class AMDGPUSubtarget;
class FeatureBitset;
class Function;
+class GCNSubtarget;
class GlobalValue;
class MCContext;
class MCRegisterClass;
@@ -54,16 +57,6 @@ enum {
TRAP_NUM_SGPRS = 16
};
-/// Instruction set architecture version.
-struct IsaVersion {
- unsigned Major;
- unsigned Minor;
- unsigned Stepping;
-};
-
-/// \returns Isa version for given subtarget \p Features.
-IsaVersion getIsaVersion(const FeatureBitset &Features);
-
/// Streams isa version string for given subtarget \p STI into \p Stream.
void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream);
@@ -71,114 +64,114 @@ void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream);
/// false otherwise.
bool hasCodeObjectV3(const MCSubtargetInfo *STI);
-/// \returns Wavefront size for given subtarget \p Features.
-unsigned getWavefrontSize(const FeatureBitset &Features);
+/// \returns Wavefront size for given subtarget \p STI.
+unsigned getWavefrontSize(const MCSubtargetInfo *STI);
-/// \returns Local memory size in bytes for given subtarget \p Features.
-unsigned getLocalMemorySize(const FeatureBitset &Features);
+/// \returns Local memory size in bytes for given subtarget \p STI.
+unsigned getLocalMemorySize(const MCSubtargetInfo *STI);
/// \returns Number of execution units per compute unit for given subtarget \p
-/// Features.
-unsigned getEUsPerCU(const FeatureBitset &Features);
+/// STI.
+unsigned getEUsPerCU(const MCSubtargetInfo *STI);
/// \returns Maximum number of work groups per compute unit for given subtarget
-/// \p Features and limited by given \p FlatWorkGroupSize.
-unsigned getMaxWorkGroupsPerCU(const FeatureBitset &Features,
+/// \p STI and limited by given \p FlatWorkGroupSize.
+unsigned getMaxWorkGroupsPerCU(const MCSubtargetInfo *STI,
unsigned FlatWorkGroupSize);
/// \returns Maximum number of waves per compute unit for given subtarget \p
-/// Features without any kind of limitation.
-unsigned getMaxWavesPerCU(const FeatureBitset &Features);
+/// STI without any kind of limitation.
+unsigned getMaxWavesPerCU(const MCSubtargetInfo *STI);
/// \returns Maximum number of waves per compute unit for given subtarget \p
-/// Features and limited by given \p FlatWorkGroupSize.
-unsigned getMaxWavesPerCU(const FeatureBitset &Features,
+/// STI and limited by given \p FlatWorkGroupSize.
+unsigned getMaxWavesPerCU(const MCSubtargetInfo *STI,
unsigned FlatWorkGroupSize);
/// \returns Minimum number of waves per execution unit for given subtarget \p
-/// Features.
-unsigned getMinWavesPerEU(const FeatureBitset &Features);
+/// STI.
+unsigned getMinWavesPerEU(const MCSubtargetInfo *STI);
/// \returns Maximum number of waves per execution unit for given subtarget \p
-/// Features without any kind of limitation.
+/// STI without any kind of limitation.
unsigned getMaxWavesPerEU();
/// \returns Maximum number of waves per execution unit for given subtarget \p
-/// Features and limited by given \p FlatWorkGroupSize.
-unsigned getMaxWavesPerEU(const FeatureBitset &Features,
+/// STI and limited by given \p FlatWorkGroupSize.
+unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI,
unsigned FlatWorkGroupSize);
-/// \returns Minimum flat work group size for given subtarget \p Features.
-unsigned getMinFlatWorkGroupSize(const FeatureBitset &Features);
+/// \returns Minimum flat work group size for given subtarget \p STI.
+unsigned getMinFlatWorkGroupSize(const MCSubtargetInfo *STI);
-/// \returns Maximum flat work group size for given subtarget \p Features.
-unsigned getMaxFlatWorkGroupSize(const FeatureBitset &Features);
+/// \returns Maximum flat work group size for given subtarget \p STI.
+unsigned getMaxFlatWorkGroupSize(const MCSubtargetInfo *STI);
-/// \returns Number of waves per work group for given subtarget \p Features and
+/// \returns Number of waves per work group for given subtarget \p STI and
/// limited by given \p FlatWorkGroupSize.
-unsigned getWavesPerWorkGroup(const FeatureBitset &Features,
+unsigned getWavesPerWorkGroup(const MCSubtargetInfo *STI,
unsigned FlatWorkGroupSize);
-/// \returns SGPR allocation granularity for given subtarget \p Features.
-unsigned getSGPRAllocGranule(const FeatureBitset &Features);
+/// \returns SGPR allocation granularity for given subtarget \p STI.
+unsigned getSGPRAllocGranule(const MCSubtargetInfo *STI);
-/// \returns SGPR encoding granularity for given subtarget \p Features.
-unsigned getSGPREncodingGranule(const FeatureBitset &Features);
+/// \returns SGPR encoding granularity for given subtarget \p STI.
+unsigned getSGPREncodingGranule(const MCSubtargetInfo *STI);
-/// \returns Total number of SGPRs for given subtarget \p Features.
-unsigned getTotalNumSGPRs(const FeatureBitset &Features);
+/// \returns Total number of SGPRs for given subtarget \p STI.
+unsigned getTotalNumSGPRs(const MCSubtargetInfo *STI);
-/// \returns Addressable number of SGPRs for given subtarget \p Features.
-unsigned getAddressableNumSGPRs(const FeatureBitset &Features);
+/// \returns Addressable number of SGPRs for given subtarget \p STI.
+unsigned getAddressableNumSGPRs(const MCSubtargetInfo *STI);
/// \returns Minimum number of SGPRs that meets the given number of waves per
-/// execution unit requirement for given subtarget \p Features.
-unsigned getMinNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU);
+/// execution unit requirement for given subtarget \p STI.
+unsigned getMinNumSGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU);
/// \returns Maximum number of SGPRs that meets the given number of waves per
-/// execution unit requirement for given subtarget \p Features.
-unsigned getMaxNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU,
+/// execution unit requirement for given subtarget \p STI.
+unsigned getMaxNumSGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU,
bool Addressable);
/// \returns Number of extra SGPRs implicitly required by given subtarget \p
-/// Features when the given special registers are used.
-unsigned getNumExtraSGPRs(const FeatureBitset &Features, bool VCCUsed,
+/// STI when the given special registers are used.
+unsigned getNumExtraSGPRs(const MCSubtargetInfo *STI, bool VCCUsed,
bool FlatScrUsed, bool XNACKUsed);
/// \returns Number of extra SGPRs implicitly required by given subtarget \p
-/// Features when the given special registers are used. XNACK is inferred from
-/// \p Features.
-unsigned getNumExtraSGPRs(const FeatureBitset &Features, bool VCCUsed,
+/// STI when the given special registers are used. XNACK is inferred from
+/// \p STI.
+unsigned getNumExtraSGPRs(const MCSubtargetInfo *STI, bool VCCUsed,
bool FlatScrUsed);
-/// \returns Number of SGPR blocks needed for given subtarget \p Features when
+/// \returns Number of SGPR blocks needed for given subtarget \p STI when
/// \p NumSGPRs are used. \p NumSGPRs should already include any special
/// register counts.
-unsigned getNumSGPRBlocks(const FeatureBitset &Features, unsigned NumSGPRs);
+unsigned getNumSGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs);
-/// \returns VGPR allocation granularity for given subtarget \p Features.
-unsigned getVGPRAllocGranule(const FeatureBitset &Features);
+/// \returns VGPR allocation granularity for given subtarget \p STI.
+unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI);
-/// \returns VGPR encoding granularity for given subtarget \p Features.
-unsigned getVGPREncodingGranule(const FeatureBitset &Features);
+/// \returns VGPR encoding granularity for given subtarget \p STI.
+unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI);
-/// \returns Total number of VGPRs for given subtarget \p Features.
-unsigned getTotalNumVGPRs(const FeatureBitset &Features);
+/// \returns Total number of VGPRs for given subtarget \p STI.
+unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI);
-/// \returns Addressable number of VGPRs for given subtarget \p Features.
-unsigned getAddressableNumVGPRs(const FeatureBitset &Features);
+/// \returns Addressable number of VGPRs for given subtarget \p STI.
+unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI);
/// \returns Minimum number of VGPRs that meets given number of waves per
-/// execution unit requirement for given subtarget \p Features.
-unsigned getMinNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU);
+/// execution unit requirement for given subtarget \p STI.
+unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU);
/// \returns Maximum number of VGPRs that meets given number of waves per
-/// execution unit requirement for given subtarget \p Features.
-unsigned getMaxNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU);
+/// execution unit requirement for given subtarget \p STI.
+unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU);
-/// \returns Number of VGPR blocks needed for given subtarget \p Features when
+/// \returns Number of VGPR blocks needed for given subtarget \p STI when
/// \p NumVGPRs are used.
-unsigned getNumVGPRBlocks(const FeatureBitset &Features, unsigned NumSGPRs);
+unsigned getNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumVGPRs);
} // end namespace IsaInfo
@@ -191,6 +184,7 @@ struct MIMGBaseOpcodeInfo {
bool Atomic;
bool AtomicX2;
bool Sampler;
+ bool Gather4;
uint8_t NumExtraArgs;
bool Gradients;
@@ -228,10 +222,28 @@ LLVM_READONLY
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels);
LLVM_READONLY
+int getMUBUFBaseOpcode(unsigned Opc);
+
+LLVM_READONLY
+int getMUBUFOpcode(unsigned BaseOpc, unsigned Dwords);
+
+LLVM_READONLY
+int getMUBUFDwords(unsigned Opc);
+
+LLVM_READONLY
+bool getMUBUFHasVAddr(unsigned Opc);
+
+LLVM_READONLY
+bool getMUBUFHasSrsrc(unsigned Opc);
+
+LLVM_READONLY
+bool getMUBUFHasSoffset(unsigned Opc);
+
+LLVM_READONLY
int getMCOpcode(uint16_t Opcode, unsigned Gen);
void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
- const FeatureBitset &Features);
+ const MCSubtargetInfo *STI);
amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor();
@@ -265,26 +277,52 @@ std::pair<int, int> getIntegerPairAttribute(const Function &F,
std::pair<int, int> Default,
bool OnlyFirstRequired = false);
+/// Represents the counter values to wait for in an s_waitcnt instruction.
+///
+/// Large values (including the maximum possible integer) can be used to
+/// represent "don't care" waits. A default-constructed Waitcnt has all three counters set to ~0u.
+struct Waitcnt {
+ unsigned VmCnt = ~0u; // Each counter defaults to all-ones, the largest unsigned value.
+ unsigned ExpCnt = ~0u;
+ unsigned LgkmCnt = ~0u;
+
+ Waitcnt() {} // Leaves every counter at its ~0u default.
+ Waitcnt(unsigned VmCnt, unsigned ExpCnt, unsigned LgkmCnt)
+ : VmCnt(VmCnt), ExpCnt(ExpCnt), LgkmCnt(LgkmCnt) {}
+
+ static Waitcnt allZero() { return Waitcnt(0, 0, 0); } // All three counters set to 0.
+
+ bool dominates(const Waitcnt &Other) const { // True when no counter here exceeds the matching counter in Other.
+ return VmCnt <= Other.VmCnt && ExpCnt <= Other.ExpCnt &&
+ LgkmCnt <= Other.LgkmCnt;
+ }
+
+ Waitcnt combined(const Waitcnt &Other) const { // Builds a Waitcnt from the per-counter minimum of this and Other.
+ return Waitcnt(std::min(VmCnt, Other.VmCnt), std::min(ExpCnt, Other.ExpCnt),
+ std::min(LgkmCnt, Other.LgkmCnt)); // NOTE(review): std::min needs <algorithm>; includes are not visible in this hunk — confirm.
+ }
+};
+
/// \returns Vmcnt bit mask for given isa \p Version.
-unsigned getVmcntBitMask(const IsaInfo::IsaVersion &Version);
+unsigned getVmcntBitMask(const IsaVersion &Version);
/// \returns Expcnt bit mask for given isa \p Version.
-unsigned getExpcntBitMask(const IsaInfo::IsaVersion &Version);
+unsigned getExpcntBitMask(const IsaVersion &Version);
/// \returns Lgkmcnt bit mask for given isa \p Version.
-unsigned getLgkmcntBitMask(const IsaInfo::IsaVersion &Version);
+unsigned getLgkmcntBitMask(const IsaVersion &Version);
/// \returns Waitcnt bit mask for given isa \p Version.
-unsigned getWaitcntBitMask(const IsaInfo::IsaVersion &Version);
+unsigned getWaitcntBitMask(const IsaVersion &Version);
/// \returns Decoded Vmcnt from given \p Waitcnt for given isa \p Version.
-unsigned decodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt);
+unsigned decodeVmcnt(const IsaVersion &Version, unsigned Waitcnt);
/// \returns Decoded Expcnt from given \p Waitcnt for given isa \p Version.
-unsigned decodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt);
+unsigned decodeExpcnt(const IsaVersion &Version, unsigned Waitcnt);
/// \returns Decoded Lgkmcnt from given \p Waitcnt for given isa \p Version.
-unsigned decodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt);
+unsigned decodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt);
/// Decodes Vmcnt, Expcnt and Lgkmcnt from given \p Waitcnt for given isa
/// \p Version, and writes decoded values into \p Vmcnt, \p Expcnt and
@@ -295,19 +333,21 @@ unsigned decodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt);
/// \p Vmcnt = \p Waitcnt[3:0] | \p Waitcnt[15:14] (gfx9+ only)
/// \p Expcnt = \p Waitcnt[6:4]
/// \p Lgkmcnt = \p Waitcnt[11:8]
-void decodeWaitcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
+void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt,
unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt);
+Waitcnt decodeWaitcnt(const IsaVersion &Version, unsigned Encoded);
+
/// \returns \p Waitcnt with encoded \p Vmcnt for given isa \p Version.
-unsigned encodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
+unsigned encodeVmcnt(const IsaVersion &Version, unsigned Waitcnt,
unsigned Vmcnt);
/// \returns \p Waitcnt with encoded \p Expcnt for given isa \p Version.
-unsigned encodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
+unsigned encodeExpcnt(const IsaVersion &Version, unsigned Waitcnt,
unsigned Expcnt);
/// \returns \p Waitcnt with encoded \p Lgkmcnt for given isa \p Version.
-unsigned encodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
+unsigned encodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt,
unsigned Lgkmcnt);
/// Encodes \p Vmcnt, \p Expcnt and \p Lgkmcnt into Waitcnt for given isa
@@ -322,9 +362,11 @@ unsigned encodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
///
/// \returns Waitcnt with encoded \p Vmcnt, \p Expcnt and \p Lgkmcnt for given
/// isa \p Version.
-unsigned encodeWaitcnt(const IsaInfo::IsaVersion &Version,
+unsigned encodeWaitcnt(const IsaVersion &Version,
unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt);
+unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded);
+
unsigned getInitialPSInputAddr(const Function &F);
LLVM_READNONE
@@ -349,6 +391,7 @@ inline bool isKernel(CallingConv::ID CC) {
}
bool hasXNACK(const MCSubtargetInfo &STI);
+bool hasSRAMECC(const MCSubtargetInfo &STI);
bool hasMIMG_R128(const MCSubtargetInfo &STI);
bool hasPackedD16(const MCSubtargetInfo &STI);
@@ -447,6 +490,9 @@ int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);
/// not the encoded offset.
bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);
+bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
+ const GCNSubtarget *Subtarget, uint32_t Align = 4);
+
/// \returns true if the intrinsic is divergent
bool isIntrinsicSourceOfDivergence(unsigned IntrID);
diff --git a/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp b/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp
deleted file mode 100644
index 1924f71f11c84..0000000000000
--- a/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp
+++ /dev/null
@@ -1,75 +0,0 @@
-//===-- AMDGPULaneDominator.cpp - Determine Lane Dominators ---------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// MBB A lane-dominates MBB B if
-// 1. A dominates B in the usual sense, i.e. every path from the entry to B
-// goes through A, and
-// 2. whenever B executes, every active lane during that execution of B was
-// also active during the most recent execution of A.
-//
-// The simplest example where A dominates B but does not lane-dominate it is
-// where A is a loop:
-//
-// |
-// +--+
-// A |
-// +--+
-// |
-// B
-//
-// Unfortunately, the second condition is not fully captured by the control
-// flow graph when it is unstructured (as may happen when branch conditions are
-// uniform).
-//
-// The following replacement of the second condition is a conservative
-// approximation. It is an equivalent condition when the CFG is fully
-// structured:
-//
-// 2'. every cycle in the CFG that contains A also contains B.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPULaneDominator.h"
-
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-
-namespace llvm {
-
-namespace AMDGPU {
-
-// Given machine basic blocks A and B where A dominates B, check whether
-// A lane-dominates B.
-//
-// The check is conservative, i.e. there can be false-negatives.
-bool laneDominates(MachineBasicBlock *A, MachineBasicBlock *B) {
- // Check whether A is reachable from itself without going through B.
- DenseSet<MachineBasicBlock *> Reachable;
- SmallVector<MachineBasicBlock *, 8> Stack;
-
- Stack.push_back(A);
- do {
- MachineBasicBlock *MBB = Stack.back();
- Stack.pop_back();
-
- for (MachineBasicBlock *Succ : MBB->successors()) {
- if (Succ == A)
- return false;
- if (Succ != B && Reachable.insert(Succ).second)
- Stack.push_back(Succ);
- }
- } while (!Stack.empty());
-
- return true;
-}
-
-} // namespace AMDGPU
-
-} // namespace llvm
diff --git a/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h b/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h
deleted file mode 100644
index 4f33a89a364bd..0000000000000
--- a/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h
+++ /dev/null
@@ -1,24 +0,0 @@
-//===- AMDGPULaneDominator.h ------------------------------------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H
-#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H
-
-namespace llvm {
-
-class MachineBasicBlock;
-
-namespace AMDGPU {
-
-bool laneDominates(MachineBasicBlock *MBBA, MachineBasicBlock *MBBB);
-
-} // end namespace AMDGPU
-} // end namespace llvm
-
-#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H
diff --git a/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h b/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
index 9f0a4d29b5e43..82ffdef8e674a 100644
--- a/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
+++ b/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
@@ -46,6 +46,7 @@
int64_t Value = 0; \
if (!expectAbsExpression(MCParser, Value, Err)) \
return false; \
+ C.compute_pgm_resource_registers &= ~(SetMacro(0xFFFFFFFFFFFFFFFFULL) << Shift); \
C.compute_pgm_resource_registers |= SetMacro(Value) << Shift; \
return true; \
}
diff --git a/lib/Target/AMDGPU/Utils/CMakeLists.txt b/lib/Target/AMDGPU/Utils/CMakeLists.txt
index c5ed32e46821b..01b80ebe8d3dc 100644
--- a/lib/Target/AMDGPU/Utils/CMakeLists.txt
+++ b/lib/Target/AMDGPU/Utils/CMakeLists.txt
@@ -2,5 +2,4 @@ add_llvm_library(LLVMAMDGPUUtils
AMDGPUBaseInfo.cpp
AMDKernelCodeTUtils.cpp
AMDGPUAsmUtils.cpp
- AMDGPULaneDominator.cpp
)
diff --git a/lib/Target/AMDGPU/VOP1Instructions.td b/lib/Target/AMDGPU/VOP1Instructions.td
index 4c7a92219755b..68446ab79720a 100644
--- a/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/lib/Target/AMDGPU/VOP1Instructions.td
@@ -84,6 +84,10 @@ class VOP1_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
let AsmMatchConverter = "cvtSdwaVOP1";
}
+class VOP1_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
+ VOP_DPP_Pseudo <OpName, P, pattern> {
+}
+
class getVOP1Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies {
list<dag> ret =
!if(P.HasModifiers,
@@ -103,6 +107,8 @@ multiclass VOP1Inst <string opName, VOPProfile P,
def _e32 : VOP1_Pseudo <opName, P>;
def _e64 : VOP3_Pseudo <opName, P, getVOP1Pat64<node, P>.ret>;
def _sdwa : VOP1_SDWA_Pseudo <opName, P>;
+ foreach _ = BoolToList<P.HasExtDPP>.ret in
+ def _dpp : VOP1_DPP_Pseudo <opName, P>;
}
// Special profile for instructions which have clamp
@@ -173,7 +179,9 @@ defm V_CVT_F32_I32 : VOP1Inst <"v_cvt_f32_i32", VOP1_F32_I32, sint_to_fp>;
defm V_CVT_F32_U32 : VOP1Inst <"v_cvt_f32_u32", VOP1_F32_I32, uint_to_fp>;
defm V_CVT_U32_F32 : VOP1Inst <"v_cvt_u32_f32", VOP_I32_F32, fp_to_uint>;
defm V_CVT_I32_F32 : VOP1Inst <"v_cvt_i32_f32", VOP_I32_F32, fp_to_sint>;
+let FPDPRounding = 1 in {
defm V_CVT_F16_F32 : VOP1Inst <"v_cvt_f16_f32", VOP_F16_F32, fpround>;
+} // End FPDPRounding = 1
defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_F16, fpextend>;
defm V_CVT_RPI_I32_F32 : VOP1Inst <"v_cvt_rpi_i32_f32", VOP_I32_F32, cvt_rpi_i32_f32>;
defm V_CVT_FLR_I32_F32 : VOP1Inst <"v_cvt_flr_i32_f32", VOP_I32_F32, cvt_flr_i32_f32>;
@@ -226,7 +234,9 @@ defm V_FFBH_I32 : VOP1Inst <"v_ffbh_i32", VOP_I32_I32>;
let SchedRW = [WriteDoubleAdd] in {
defm V_FREXP_EXP_I32_F64 : VOP1Inst <"v_frexp_exp_i32_f64", VOP_I32_F64, int_amdgcn_frexp_exp>;
defm V_FREXP_MANT_F64 : VOP1Inst <"v_frexp_mant_f64", VOP_F64_F64, int_amdgcn_frexp_mant>;
+let FPDPRounding = 1 in {
defm V_FRACT_F64 : VOP1Inst <"v_fract_f64", VOP_F64_F64, AMDGPUfract>;
+} // End FPDPRounding = 1
} // End SchedRW = [WriteDoubleAdd]
defm V_FREXP_EXP_I32_F32 : VOP1Inst <"v_frexp_exp_i32_f32", VOP_I32_F32, int_amdgcn_frexp_exp>;
@@ -242,7 +252,9 @@ def VOP_I32_VI32_NO_EXT : VOPProfile<[i32, i32, untyped, untyped]> {
let Src0RC64 = VRegSrc_32;
let HasExt = 0;
- let HasSDWA9 = 0;
+ let HasExtDPP = 0;
+ let HasExtSDWA = 0;
+ let HasExtSDWA9 = 0;
}
// Special case because there are no true output operands. Hack vdst
@@ -271,7 +283,10 @@ def VOP_MOVRELD : VOPProfile<[untyped, i32, untyped, untyped]> {
let AsmSDWA9 = getAsmSDWA9<1, 0, 1>.ret;
let HasExt = 0;
- let HasSDWA9 = 0;
+ let HasExtDPP = 0;
+ let HasExtSDWA = 0;
+ let HasExtSDWA9 = 0;
+
let HasDst = 0;
let EmitDst = 1; // force vdst emission
}
@@ -328,8 +343,10 @@ defm V_EXP_LEGACY_F32 : VOP1Inst <"v_exp_legacy_f32", VOP_F32_F32>;
let SubtargetPredicate = Has16BitInsts in {
+let FPDPRounding = 1 in {
defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP1_F16_I16, uint_to_fp>;
defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP1_F16_I16, sint_to_fp>;
+} // End FPDPRounding = 1
defm V_CVT_U16_F16 : VOP1Inst <"v_cvt_u16_f16", VOP_I16_F16, fp_to_uint>;
defm V_CVT_I16_F16 : VOP1Inst <"v_cvt_i16_f16", VOP_I16_F16, fp_to_sint>;
let SchedRW = [WriteQuarterRate32] in {
@@ -347,7 +364,9 @@ defm V_FLOOR_F16 : VOP1Inst <"v_floor_f16", VOP_F16_F16, ffloor>;
defm V_CEIL_F16 : VOP1Inst <"v_ceil_f16", VOP_F16_F16, fceil>;
defm V_TRUNC_F16 : VOP1Inst <"v_trunc_f16", VOP_F16_F16, ftrunc>;
defm V_RNDNE_F16 : VOP1Inst <"v_rndne_f16", VOP_F16_F16, frint>;
+let FPDPRounding = 1 in {
defm V_FRACT_F16 : VOP1Inst <"v_fract_f16", VOP_F16_F16, AMDGPUfract>;
+} // End FPDPRounding = 1
}
@@ -495,13 +514,8 @@ defm V_EXP_LEGACY_F32 : VOP1_Real_ci <0x46>;
// VI
//===----------------------------------------------------------------------===//
-class VOP1_DPP <bits<8> op, VOP1_Pseudo ps, VOPProfile P = ps.Pfl> :
- VOP_DPP <ps.OpName, P> {
- let Defs = ps.Defs;
- let Uses = ps.Uses;
- let SchedRW = ps.SchedRW;
- let hasSideEffects = ps.hasSideEffects;
-
+class VOP1_DPPe <bits<8> op, VOP1_DPP_Pseudo ps, VOPProfile P = ps.Pfl> :
+ VOP_DPPe <P> {
bits<8> vdst;
let Inst{8-0} = 0xfa; // dpp
let Inst{16-9} = op;
@@ -539,9 +553,10 @@ multiclass VOP1_Real_vi <bits<10> op> {
VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
- // For now left dpp only for asm/dasm
- // TODO: add corresponding pseudo
- def _dpp : VOP1_DPP<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")>;
+ foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
+ def _dpp_vi :
+ VOP_DPP_Real<!cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.VI>,
+ VOP1_DPPe<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")>;
}
defm V_NOP : VOP1_Real_vi <0x0>;
@@ -712,9 +727,11 @@ multiclass VOP1_Real_gfx9 <bits<10> op> {
VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
- // For now left dpp only for asm/dasm
- // TODO: add corresponding pseudo
- def _dpp : VOP1_DPP<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")>;
+ foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
+ def _dpp_gfx9 :
+ VOP_DPP_Real<!cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>,
+ VOP1_DPPe<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")>;
+
}
defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>;
diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td
index 5ec1a15c5cd20..e3fd7b5f9fadd 100644
--- a/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/lib/Target/AMDGPU/VOP2Instructions.td
@@ -105,6 +105,11 @@ class VOP2_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
let AsmMatchConverter = "cvtSdwaVOP2";
}
+class VOP2_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
+ VOP_DPP_Pseudo <OpName, P, pattern> {
+}
+
+
class getVOP2Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies {
list<dag> ret = !if(P.HasModifiers,
[(set P.DstVT:$vdst,
@@ -116,22 +121,49 @@ class getVOP2Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies {
[(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]);
}
-multiclass VOP2Inst <string opName,
- VOPProfile P,
- SDPatternOperator node = null_frag,
- string revOp = opName,
- bit GFX9Renamed = 0> {
-
+multiclass VOP2Inst_e32<string opName,
+ VOPProfile P,
+ SDPatternOperator node = null_frag,
+ string revOp = opName,
+ bit GFX9Renamed = 0> {
let renamedInGFX9 = GFX9Renamed in {
-
- def _e32 : VOP2_Pseudo <opName, P>,
+ def _e32 : VOP2_Pseudo <opName, P, VOPPatOrNull<node,P>.ret>,
Commutable_REV<revOp#"_e32", !eq(revOp, opName)>;
+ } // End renamedInGFX9 = GFX9Renamed
+}
+multiclass VOP2Inst_e64<string opName,
+ VOPProfile P,
+ SDPatternOperator node = null_frag,
+ string revOp = opName,
+ bit GFX9Renamed = 0> {
+ let renamedInGFX9 = GFX9Renamed in {
def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
Commutable_REV<revOp#"_e64", !eq(revOp, opName)>;
+ } // End renamedInGFX9 = GFX9Renamed
+}
- def _sdwa : VOP2_SDWA_Pseudo <opName, P>;
+multiclass VOP2Inst_sdwa<string opName,
+ VOPProfile P,
+ SDPatternOperator node = null_frag,
+ string revOp = opName,
+ bit GFX9Renamed = 0> {
+ let renamedInGFX9 = GFX9Renamed in {
+ def _sdwa : VOP2_SDWA_Pseudo <opName, P>;
+ } // End renamedInGFX9 = GFX9Renamed
+}
+multiclass VOP2Inst<string opName,
+ VOPProfile P,
+ SDPatternOperator node = null_frag,
+ string revOp = opName,
+ bit GFX9Renamed = 0> :
+ VOP2Inst_e32<opName, P, node, revOp, GFX9Renamed>,
+ VOP2Inst_e64<opName, P, node, revOp, GFX9Renamed>,
+ VOP2Inst_sdwa<opName, P, node, revOp, GFX9Renamed> {
+ let renamedInGFX9 = GFX9Renamed in {
+ foreach _ = BoolToList<P.HasExtDPP>.ret in
+ def _dpp : VOP2_DPP_Pseudo <opName, P>;
}
}
@@ -144,12 +176,14 @@ multiclass VOP2bInst <string opName,
let renamedInGFX9 = GFX9Renamed in {
let SchedRW = [Write32Bit, WriteSALU] in {
let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]), Defs = [VCC] in {
- def _e32 : VOP2_Pseudo <opName, P>,
+ def _e32 : VOP2_Pseudo <opName, P, VOPPatOrNull<node,P>.ret>,
Commutable_REV<revOp#"_e32", !eq(revOp, opName)>;
def _sdwa : VOP2_SDWA_Pseudo <opName, P> {
let AsmMatchConverter = "cvtSdwaVOP2b";
}
+ foreach _ = BoolToList<P.HasExtDPP>.ret in
+ def _dpp : VOP2_DPP_Pseudo <opName, P>;
}
def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
@@ -172,6 +206,9 @@ multiclass VOP2eInst <string opName,
def _sdwa : VOP2_SDWA_Pseudo <opName, P> {
let AsmMatchConverter = "cvtSdwaVOP2b";
}
+
+ foreach _ = BoolToList<P.HasExtDPP>.ret in
+ def _dpp : VOP2_DPP_Pseudo <opName, P>;
}
def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
@@ -211,9 +248,9 @@ class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2);
let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3,
0, HasModifiers, HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret;
- let InsDPP = (ins DstRCDPP:$old,
- Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
+ let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
+ VGPR_32:$src2, // stub argument
dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
@@ -230,21 +267,15 @@ class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
let AsmSDWA9 = getAsmSDWA9<1, 1, 2, vt>.ret;
let HasSrc2 = 0;
let HasSrc2Mods = 0;
- let HasExt = 1;
- let HasSDWA9 = 0;
-}
-def VOP_MAC_F16 : VOP_MAC <f16> {
- // FIXME: Move 'Asm64' definition to VOP_MAC, and use 'vt'. Currently it gives
- // 'not a string initializer' error.
- let Asm64 = getAsm64<1, 2, 0, HasModifiers, HasOMod, f16>.ret;
+ let HasExt = 1;
+ let HasExtDPP = 1;
+ let HasExtSDWA = 1;
+ let HasExtSDWA9 = 0;
}
-def VOP_MAC_F32 : VOP_MAC <f32> {
- // FIXME: Move 'Asm64' definition to VOP_MAC, and use 'vt'. Currently it gives
- // 'not a string initializer' error.
- let Asm64 = getAsm64<1, 2, 0, HasModifiers, HasOMod, f32>.ret;
-}
+def VOP_MAC_F16 : VOP_MAC <f16>;
+def VOP_MAC_F32 : VOP_MAC <f32>;
// Write out to vcc or arbitrary SGPR.
def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped]> {
@@ -290,7 +321,9 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
let HasExt = 1;
- let HasSDWA9 = 1;
+ let HasExtDPP = 1;
+ let HasExtSDWA = 1;
+ let HasExtSDWA9 = 1;
}
// Read in from vcc or arbitrary SGPR
@@ -321,7 +354,9 @@ def VOP2e_I32_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
let HasExt = 1;
- let HasSDWA9 = 1;
+ let HasExtDPP = 1;
+ let HasExtSDWA = 1;
+ let HasExtSDWA9 = 1;
}
def VOP_READLANE : VOPProfile<[i32, i32, i32]> {
@@ -331,8 +366,11 @@ def VOP_READLANE : VOPProfile<[i32, i32, i32]> {
let Ins64 = Ins32;
let Asm32 = " $vdst, $src0, $src1";
let Asm64 = Asm32;
+
let HasExt = 0;
- let HasSDWA9 = 0;
+ let HasExtDPP = 0;
+ let HasExtSDWA = 0;
+ let HasExtSDWA9 = 0;
}
def VOP_WRITELANE : VOPProfile<[i32, i32, i32, i32]> {
@@ -342,20 +380,23 @@ def VOP_WRITELANE : VOPProfile<[i32, i32, i32, i32]> {
let Ins64 = Ins32;
let Asm32 = " $vdst, $src0, $src1";
let Asm64 = Asm32;
- let HasExt = 0;
- let HasSDWA9 = 0;
let HasSrc2 = 0;
let HasSrc2Mods = 0;
+
+ let HasExt = 0;
+ let HasExtDPP = 0;
+ let HasExtSDWA = 0;
+ let HasExtSDWA9 = 0;
}
//===----------------------------------------------------------------------===//
// VOP2 Instructions
//===----------------------------------------------------------------------===//
-let SubtargetPredicate = isGCN in {
+let SubtargetPredicate = isGCN, Predicates = [isGCN] in {
defm V_CNDMASK_B32 : VOP2eInst <"v_cndmask_b32", VOP2e_I32_I32_I32_I1>;
-def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32, [], "">;
+def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32, []>;
let isCommutable = 1 in {
defm V_ADD_F32 : VOP2Inst <"v_add_f32", VOP_F32_F32_F32, fadd>;
@@ -363,29 +404,29 @@ defm V_SUB_F32 : VOP2Inst <"v_sub_f32", VOP_F32_F32_F32, fsub>;
defm V_SUBREV_F32 : VOP2Inst <"v_subrev_f32", VOP_F32_F32_F32, null_frag, "v_sub_f32">;
defm V_MUL_LEGACY_F32 : VOP2Inst <"v_mul_legacy_f32", VOP_F32_F32_F32, AMDGPUfmul_legacy>;
defm V_MUL_F32 : VOP2Inst <"v_mul_f32", VOP_F32_F32_F32, fmul>;
-defm V_MUL_I32_I24 : VOP2Inst <"v_mul_i32_i24", VOP_I32_I32_I32, AMDGPUmul_i24>;
-defm V_MUL_HI_I32_I24 : VOP2Inst <"v_mul_hi_i32_i24", VOP_I32_I32_I32, AMDGPUmulhi_i24>;
-defm V_MUL_U32_U24 : VOP2Inst <"v_mul_u32_u24", VOP_I32_I32_I32, AMDGPUmul_u24>;
-defm V_MUL_HI_U32_U24 : VOP2Inst <"v_mul_hi_u32_u24", VOP_I32_I32_I32, AMDGPUmulhi_u24>;
-defm V_MIN_F32 : VOP2Inst <"v_min_f32", VOP_F32_F32_F32, fminnum>;
-defm V_MAX_F32 : VOP2Inst <"v_max_f32", VOP_F32_F32_F32, fmaxnum>;
-defm V_MIN_I32 : VOP2Inst <"v_min_i32", VOP_I32_I32_I32>;
-defm V_MAX_I32 : VOP2Inst <"v_max_i32", VOP_I32_I32_I32>;
-defm V_MIN_U32 : VOP2Inst <"v_min_u32", VOP_I32_I32_I32>;
-defm V_MAX_U32 : VOP2Inst <"v_max_u32", VOP_I32_I32_I32>;
+defm V_MUL_I32_I24 : VOP2Inst <"v_mul_i32_i24", VOP_PAT_GEN<VOP_I32_I32_I32, 2>, AMDGPUmul_i24>;
+defm V_MUL_HI_I32_I24 : VOP2Inst <"v_mul_hi_i32_i24", VOP_PAT_GEN<VOP_I32_I32_I32, 2>, AMDGPUmulhi_i24>;
+defm V_MUL_U32_U24 : VOP2Inst <"v_mul_u32_u24", VOP_PAT_GEN<VOP_I32_I32_I32, 2>, AMDGPUmul_u24>;
+defm V_MUL_HI_U32_U24 : VOP2Inst <"v_mul_hi_u32_u24", VOP_PAT_GEN<VOP_I32_I32_I32, 2>, AMDGPUmulhi_u24>;
+defm V_MIN_F32 : VOP2Inst <"v_min_f32", VOP_F32_F32_F32, fminnum_like>;
+defm V_MAX_F32 : VOP2Inst <"v_max_f32", VOP_F32_F32_F32, fmaxnum_like>;
+defm V_MIN_I32 : VOP2Inst <"v_min_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, smin>;
+defm V_MAX_I32 : VOP2Inst <"v_max_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, smax>;
+defm V_MIN_U32 : VOP2Inst <"v_min_u32", VOP_PAT_GEN<VOP_I32_I32_I32>, umin>;
+defm V_MAX_U32 : VOP2Inst <"v_max_u32", VOP_PAT_GEN<VOP_I32_I32_I32>, umax>;
defm V_LSHRREV_B32 : VOP2Inst <"v_lshrrev_b32", VOP_I32_I32_I32, null_frag, "v_lshr_b32">;
defm V_ASHRREV_I32 : VOP2Inst <"v_ashrrev_i32", VOP_I32_I32_I32, null_frag, "v_ashr_i32">;
defm V_LSHLREV_B32 : VOP2Inst <"v_lshlrev_b32", VOP_I32_I32_I32, null_frag, "v_lshl_b32">;
-defm V_AND_B32 : VOP2Inst <"v_and_b32", VOP_I32_I32_I32>;
-defm V_OR_B32 : VOP2Inst <"v_or_b32", VOP_I32_I32_I32>;
-defm V_XOR_B32 : VOP2Inst <"v_xor_b32", VOP_I32_I32_I32>;
+defm V_AND_B32 : VOP2Inst <"v_and_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, and>;
+defm V_OR_B32 : VOP2Inst <"v_or_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, or>;
+defm V_XOR_B32 : VOP2Inst <"v_xor_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, xor>;
let Constraints = "$vdst = $src2", DisableEncoding="$src2",
isConvertibleToThreeAddress = 1 in {
defm V_MAC_F32 : VOP2Inst <"v_mac_f32", VOP_MAC_F32>;
}
-def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32, [], "">;
+def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32, []>;
// No patterns so that the scalar instructions are always selected.
// The scalar versions will be replaced with vector when needed later.
@@ -411,11 +452,11 @@ defm V_SUBREV_U32 : VOP2Inst <"v_subrev_u32", VOP_I32_I32_I32, null_frag, "v_sub
// These are special and do not read the exec mask.
let isConvergent = 1, Uses = []<Register> in {
def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE,
- [(set i32:$vdst, (int_amdgcn_readlane i32:$src0, i32:$src1))], "">;
+ [(set i32:$vdst, (int_amdgcn_readlane i32:$src0, i32:$src1))]>;
let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE,
- [(set i32:$vdst, (int_amdgcn_writelane i32:$src0, i32:$src1, i32:$vdst_in))], "">;
+ [(set i32:$vdst, (int_amdgcn_writelane i32:$src0, i32:$src1, i32:$vdst_in))]>;
} // End $vdst = $vdst_in, DisableEncoding $vdst_in
} // End isConvergent = 1
@@ -425,13 +466,13 @@ defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_NO_EXT<VOP_I32_I32
defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_hi>;
defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_NO_EXT<VOP_F32_F32_I32>, AMDGPUldexp>;
defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_NO_EXT<VOP_I32_F32_I32>>; // TODO: set "Uses = dst"
-defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT<VOP_I32_F32_F32>, AMDGPUpknorm_i16_f32>;
-defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_NO_EXT<VOP_I32_F32_F32>, AMDGPUpknorm_u16_f32>;
-defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_NO_EXT<VOP_I32_F32_F32>, AMDGPUpkrtz_f16_f32>;
-defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT<VOP_I32_I32_I32>, AMDGPUpk_u16_u32>;
-defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT<VOP_I32_I32_I32>, AMDGPUpk_i16_i32>;
+defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT<VOP_V2I16_F32_F32>, AMDGPUpknorm_i16_f32>;
+defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_NO_EXT<VOP_V2I16_F32_F32>, AMDGPUpknorm_u16_f32>;
+defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_NO_EXT<VOP_V2F16_F32_F32>, AMDGPUpkrtz_f16_f32>;
+defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT<VOP_V2I16_I32_I32>, AMDGPUpk_u16_u32>;
+defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT<VOP_V2I16_I32_I32>, AMDGPUpk_i16_i32>;
-} // End SubtargetPredicate = isGCN
+} // End SubtargetPredicate = isGCN, Predicates = [isGCN]
def : GCNPat<
(AMDGPUadde i32:$src0, i32:$src1, i1:$src2),
@@ -444,40 +485,99 @@ def : GCNPat<
>;
// These instructions only exist on SI and CI
-let SubtargetPredicate = isSICI in {
+let SubtargetPredicate = isSICI, Predicates = [isSICI] in {
defm V_MIN_LEGACY_F32 : VOP2Inst <"v_min_legacy_f32", VOP_F32_F32_F32, AMDGPUfmin_legacy>;
defm V_MAX_LEGACY_F32 : VOP2Inst <"v_max_legacy_f32", VOP_F32_F32_F32, AMDGPUfmax_legacy>;
let isCommutable = 1 in {
defm V_MAC_LEGACY_F32 : VOP2Inst <"v_mac_legacy_f32", VOP_F32_F32_F32>;
-defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_I32_I32_I32>;
-defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_I32_I32_I32>;
-defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_I32_I32_I32>;
+defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, srl>;
+defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, sra>;
+defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, shl>;
} // End isCommutable = 1
-} // End let SubtargetPredicate = SICI
+} // End let SubtargetPredicate = isSICI, Predicates = [isSICI]
+
+class DivergentBinOp<SDPatternOperator Op, VOP_Pseudo Inst> :
+ GCNPat<
+ (getDivergentFrag<Op>.ret Inst.Pfl.Src0VT:$src0, Inst.Pfl.Src1VT:$src1),
+ !if(!cast<Commutable_REV>(Inst).IsOrig,
+ (Inst $src0, $src1),
+ (Inst $src1, $src0)
+ )
+ >;
+
+let AddedComplexity = 1 in {
+ def : DivergentBinOp<srl, V_LSHRREV_B32_e64>;
+ def : DivergentBinOp<sra, V_ASHRREV_I32_e64>;
+ def : DivergentBinOp<shl, V_LSHLREV_B32_e64>;
+}
+
+let SubtargetPredicate = HasAddNoCarryInsts in {
+ def : DivergentBinOp<add, V_ADD_U32_e32>;
+ def : DivergentBinOp<sub, V_SUB_U32_e32>;
+ def : DivergentBinOp<sub, V_SUBREV_U32_e32>;
+}
+
+
+def : DivergentBinOp<add, V_ADD_I32_e32>;
+
+def : DivergentBinOp<add, V_ADD_I32_e64>;
+def : DivergentBinOp<sub, V_SUB_I32_e32>;
+
+def : DivergentBinOp<sub, V_SUBREV_I32_e32>;
+
+def : DivergentBinOp<srl, V_LSHRREV_B32_e32>;
+def : DivergentBinOp<sra, V_ASHRREV_I32_e32>;
+def : DivergentBinOp<shl, V_LSHLREV_B32_e32>;
+def : DivergentBinOp<adde, V_ADDC_U32_e32>;
+def : DivergentBinOp<sube, V_SUBB_U32_e32>;
+
+class divergent_i64_BinOp <SDPatternOperator Op, Instruction Inst> :
+ GCNPat<
+ (getDivergentFrag<Op>.ret i64:$src0, i64:$src1),
+ (REG_SEQUENCE VReg_64,
+ (Inst
+ (i32 (EXTRACT_SUBREG $src0, sub0)),
+ (i32 (EXTRACT_SUBREG $src1, sub0))
+ ), sub0,
+ (Inst
+ (i32 (EXTRACT_SUBREG $src0, sub1)),
+ (i32 (EXTRACT_SUBREG $src1, sub1))
+ ), sub1
+ )
+ >;
+
+def : divergent_i64_BinOp <and, V_AND_B32_e32>;
+def : divergent_i64_BinOp <or, V_OR_B32_e32>;
+def : divergent_i64_BinOp <xor, V_XOR_B32_e32>;
let SubtargetPredicate = Has16BitInsts in {
+let FPDPRounding = 1 in {
def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16, [], "">;
+defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I32, AMDGPUldexp>;
+} // End FPDPRounding = 1
+
defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16>;
defm V_LSHRREV_B16 : VOP2Inst <"v_lshrrev_b16", VOP_I16_I16_I16>;
defm V_ASHRREV_I16 : VOP2Inst <"v_ashrrev_i16", VOP_I16_I16_I16>;
-defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I32, AMDGPUldexp>;
let isCommutable = 1 in {
+let FPDPRounding = 1 in {
defm V_ADD_F16 : VOP2Inst <"v_add_f16", VOP_F16_F16_F16, fadd>;
defm V_SUB_F16 : VOP2Inst <"v_sub_f16", VOP_F16_F16_F16, fsub>;
defm V_SUBREV_F16 : VOP2Inst <"v_subrev_f16", VOP_F16_F16_F16, null_frag, "v_sub_f16">;
defm V_MUL_F16 : VOP2Inst <"v_mul_f16", VOP_F16_F16_F16, fmul>;
def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16, [], "">;
+} // End FPDPRounding = 1
defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16>;
defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16>;
defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16, null_frag, "v_sub_u16">;
defm V_MUL_LO_U16 : VOP2Inst <"v_mul_lo_u16", VOP_I16_I16_I16>;
-defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16, fmaxnum>;
-defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16, fminnum>;
+defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16, fmaxnum_like>;
+defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16, fminnum_like>;
defm V_MAX_U16 : VOP2Inst <"v_max_u16", VOP_I16_I16_I16>;
defm V_MAX_I16 : VOP2Inst <"v_max_i16", VOP_I16_I16_I16>;
defm V_MIN_U16 : VOP2Inst <"v_min_u16", VOP_I16_I16_I16>;
@@ -698,13 +798,8 @@ defm V_CVT_PK_I16_I32 : VOP2_Real_e32e64_si <0x31>;
// VI
//===----------------------------------------------------------------------===//
-class VOP2_DPP <bits<6> op, VOP2_Pseudo ps, string OpName = ps.OpName, VOPProfile P = ps.Pfl> :
- VOP_DPP <OpName, P> {
- let Defs = ps.Defs;
- let Uses = ps.Uses;
- let SchedRW = ps.SchedRW;
- let hasSideEffects = ps.hasSideEffects;
-
+class VOP2_DPPe <bits<6> op, VOP2_DPP_Pseudo ps, VOPProfile P = ps.Pfl> :
+ VOP_DPPe <P> {
bits<8> vdst;
bits<8> src1;
let Inst{8-0} = 0xfa; //dpp
@@ -716,12 +811,6 @@ class VOP2_DPP <bits<6> op, VOP2_Pseudo ps, string OpName = ps.OpName, VOPProfil
let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in {
-multiclass VOP32_Real_vi <bits<10> op> {
- def _vi :
- VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.VI>,
- VOP3e_vi<op, !cast<VOP2_Pseudo>(NAME).Pfl>;
-}
-
multiclass VOP2_Real_MADK_vi <bits<6> op> {
def _vi : VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.VI>,
VOP2_MADKe<op{5-0}, !cast<VOP2_Pseudo>(NAME).Pfl>;
@@ -791,8 +880,13 @@ multiclass VOP2be_Real_e32e64_vi_only <bits<6> op, string OpName, string AsmName
VOP2_SDWA_Pseudo ps = !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa");
let AsmString = AsmName # ps.AsmOperands;
}
- def _dpp :
- VOP2_DPP<op, !cast<VOP2_Pseudo>(OpName#"_e32"), AsmName>;
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtDPP>.ret in
+ def _dpp_vi :
+ VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(OpName#"_dpp"), SIEncodingFamily.VI>,
+ VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(OpName#"_dpp")> {
+ VOP2_DPP_Pseudo ps = !cast<VOP2_DPP_Pseudo>(OpName#"_dpp");
+ let AsmString = AsmName # ps.AsmOperands;
+ }
}
}
@@ -819,10 +913,14 @@ multiclass VOP2be_Real_e32e64_gfx9 <bits<6> op, string OpName, string AsmName> {
VOP2_SDWA_Pseudo ps = !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa");
let AsmString = AsmName # ps.AsmOperands;
}
- def _dpp_gfx9 :
- VOP2_DPP<op, !cast<VOP2_Pseudo>(OpName#"_e32"), AsmName> {
- let DecoderNamespace = "SDWA9";
- }
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtDPP>.ret in
+ def _dpp_gfx9 :
+ VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(OpName#"_dpp"), SIEncodingFamily.GFX9>,
+ VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(OpName#"_dpp")> {
+ VOP2_DPP_Pseudo ps = !cast<VOP2_DPP_Pseudo>(OpName#"_dpp");
+ let AsmString = AsmName # ps.AsmOperands;
+ let DecoderNamespace = "SDWA9";
+ }
}
multiclass VOP2_Real_e32e64_gfx9 <bits<6> op> {
@@ -840,19 +938,23 @@ multiclass VOP2_Real_e32e64_gfx9 <bits<6> op> {
VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>,
VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl> {
}
- def _dpp_gfx9 :
- VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")> {
- let DecoderNamespace = "SDWA9";
- }
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
+ def _dpp_gfx9 :
+ VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>,
+ VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")> {
+ let DecoderNamespace = "SDWA9";
+ }
}
} // AssemblerPredicates = [isGFX9]
multiclass VOP2_Real_e32e64_vi <bits<6> op> :
Base_VOP2_Real_e32e64_vi<op>, VOP2_SDWA_Real<op>, VOP2_SDWA9_Real<op> {
- // For now left dpp only for asm/dasm
- // TODO: add corresponding pseudo
- def _dpp : VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")>;
+
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
+ def _dpp_vi :
+ VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.VI>,
+ VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")>;
}
defm V_CNDMASK_B32 : VOP2_Real_e32e64_vi <0x0>;
@@ -899,9 +1001,6 @@ defm V_ADD_U32 : VOP2_Real_e32e64_gfx9 <0x34>;
defm V_SUB_U32 : VOP2_Real_e32e64_gfx9 <0x35>;
defm V_SUBREV_U32 : VOP2_Real_e32e64_gfx9 <0x36>;
-defm V_READLANE_B32 : VOP32_Real_vi <0x289>;
-defm V_WRITELANE_B32 : VOP32_Real_vi <0x28a>;
-
defm V_BFM_B32 : VOP2_Real_e64only_vi <0x293>;
defm V_BCNT_U32_B32 : VOP2_Real_e64only_vi <0x28b>;
defm V_MBCNT_LO_U32_B32 : VOP2_Real_e64only_vi <0x28c>;
diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td
index 17ae08dc62670..4b8c1f208a0ed 100644
--- a/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/lib/Target/AMDGPU/VOP3Instructions.td
@@ -17,16 +17,16 @@ class getVOP3ModPat<VOPProfile P, SDPatternOperator node> {
(VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp));
list<dag> ret3 = [(set P.DstVT:$vdst,
- (node (P.Src0VT src0),
+ (DivergentFragOrOp<node, P>.ret (P.Src0VT src0),
(P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
(P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))))];
list<dag> ret2 = [(set P.DstVT:$vdst,
- (node (P.Src0VT src0),
+ (DivergentFragOrOp<node, P>.ret (P.Src0VT src0),
(P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))];
list<dag> ret1 = [(set P.DstVT:$vdst,
- (node (P.Src0VT src0)))];
+ (DivergentFragOrOp<node, P>.ret (P.Src0VT src0)))];
list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3,
!if(!eq(P.NumSrcArgs, 2), ret2,
@@ -35,18 +35,18 @@ class getVOP3ModPat<VOPProfile P, SDPatternOperator node> {
class getVOP3PModPat<VOPProfile P, SDPatternOperator node> {
list<dag> ret3 = [(set P.DstVT:$vdst,
- (node (P.Src0VT !if(P.HasClamp, (VOP3PMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp),
+ (DivergentFragOrOp<node, P>.ret (P.Src0VT !if(P.HasClamp, (VOP3PMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp),
(VOP3PMods P.Src0VT:$src0, i32:$src0_modifiers))),
(P.Src1VT (VOP3PMods P.Src1VT:$src1, i32:$src1_modifiers)),
(P.Src2VT (VOP3PMods P.Src2VT:$src2, i32:$src2_modifiers))))];
list<dag> ret2 = [(set P.DstVT:$vdst,
- (node !if(P.HasClamp, (P.Src0VT (VOP3PMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)),
+ (DivergentFragOrOp<node, P>.ret !if(P.HasClamp, (P.Src0VT (VOP3PMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)),
(P.Src0VT (VOP3PMods P.Src0VT:$src0, i32:$src0_modifiers))),
(P.Src1VT (VOP3PMods P.Src1VT:$src1, i32:$src1_modifiers))))];
list<dag> ret1 = [(set P.DstVT:$vdst,
- (node (P.Src0VT (VOP3PMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))))];
+ (DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3PMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))))];
list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3,
!if(!eq(P.NumSrcArgs, 2), ret2,
@@ -55,18 +55,18 @@ class getVOP3PModPat<VOPProfile P, SDPatternOperator node> {
class getVOP3OpSelPat<VOPProfile P, SDPatternOperator node> {
list<dag> ret3 = [(set P.DstVT:$vdst,
- (node (P.Src0VT !if(P.HasClamp, (VOP3OpSel0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp),
+ (DivergentFragOrOp<node, P>.ret (P.Src0VT !if(P.HasClamp, (VOP3OpSel0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp),
(VOP3OpSel P.Src0VT:$src0, i32:$src0_modifiers))),
(P.Src1VT (VOP3OpSel P.Src1VT:$src1, i32:$src1_modifiers)),
(P.Src2VT (VOP3OpSel P.Src2VT:$src2, i32:$src2_modifiers))))];
list<dag> ret2 = [(set P.DstVT:$vdst,
- (node !if(P.HasClamp, (P.Src0VT (VOP3OpSel0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)),
+ (DivergentFragOrOp<node, P>.ret !if(P.HasClamp, (P.Src0VT (VOP3OpSel0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)),
(P.Src0VT (VOP3OpSel P.Src0VT:$src0, i32:$src0_modifiers))),
(P.Src1VT (VOP3OpSel P.Src1VT:$src1, i32:$src1_modifiers))))];
list<dag> ret1 = [(set P.DstVT:$vdst,
- (node (P.Src0VT (VOP3OpSel0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))))];
+ (DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3OpSel0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))))];
list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3,
!if(!eq(P.NumSrcArgs, 2), ret2,
@@ -75,18 +75,18 @@ class getVOP3OpSelPat<VOPProfile P, SDPatternOperator node> {
class getVOP3OpSelModPat<VOPProfile P, SDPatternOperator node> {
list<dag> ret3 = [(set P.DstVT:$vdst,
- (node (P.Src0VT !if(P.HasClamp, (VOP3OpSelMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp),
+ (DivergentFragOrOp<node, P>.ret (P.Src0VT !if(P.HasClamp, (VOP3OpSelMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp),
(VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers))),
(P.Src1VT (VOP3OpSelMods P.Src1VT:$src1, i32:$src1_modifiers)),
(P.Src2VT (VOP3OpSelMods P.Src2VT:$src2, i32:$src2_modifiers))))];
list<dag> ret2 = [(set P.DstVT:$vdst,
- (node !if(P.HasClamp, (P.Src0VT (VOP3OpSelMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)),
+ (DivergentFragOrOp<node, P>.ret !if(P.HasClamp, (P.Src0VT (VOP3OpSelMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)),
(P.Src0VT (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers))),
(P.Src1VT (VOP3OpSelMods P.Src1VT:$src1, i32:$src1_modifiers))))];
list<dag> ret1 = [(set P.DstVT:$vdst,
- (node (P.Src0VT (VOP3OpSelMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))))];
+ (DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3OpSelMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))))];
list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3,
!if(!eq(P.NumSrcArgs, 2), ret2,
@@ -94,9 +94,9 @@ class getVOP3OpSelModPat<VOPProfile P, SDPatternOperator node> {
}
class getVOP3Pat<VOPProfile P, SDPatternOperator node> {
- list<dag> ret3 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2))];
- list<dag> ret2 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))];
- list<dag> ret1 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0))];
+ list<dag> ret3 = [(set P.DstVT:$vdst, (DivergentFragOrOp<node, P>.ret P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2))];
+ list<dag> ret2 = [(set P.DstVT:$vdst, (DivergentFragOrOp<node, P>.ret P.Src0VT:$src0, P.Src1VT:$src1))];
+ list<dag> ret1 = [(set P.DstVT:$vdst, (DivergentFragOrOp<node, P>.ret P.Src0VT:$src0))];
list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3,
!if(!eq(P.NumSrcArgs, 2), ret2,
ret1));
@@ -185,6 +185,7 @@ class VOP3_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOPProf
getAsm64<HasDst, NumSrcArgs, HasIntClamp,
HasModifiers, HasOMod, DstVT>.ret,
P.Asm64));
+ let NeedPatGen = P.NeedPatGen;
}
class VOP3b_Profile<ValueType vt> : VOPProfile<[vt, vt, vt, vt]> {
@@ -219,7 +220,8 @@ def VOP3b_I64_I1_I32_I32_I64 : VOPProfile<[i64, i32, i32, i64]> {
// VOP3 INTERP
//===----------------------------------------------------------------------===//
-class VOP3Interp<string OpName, VOPProfile P> : VOP3_Pseudo<OpName, P> {
+class VOP3Interp<string OpName, VOPProfile P, list<dag> pattern = []> :
+ VOP3_Pseudo<OpName, P, pattern> {
let AsmMatchConverter = "cvtVOP3Interp";
}
@@ -291,11 +293,13 @@ def V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, fma>;
def V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_lerp>;
let SchedRW = [WriteDoubleAdd] in {
+let FPDPRounding = 1 in {
def V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, fma>;
def V_ADD_F64 : VOP3Inst <"v_add_f64", VOP3_Profile<VOP_F64_F64_F64>, fadd, 1>;
def V_MUL_F64 : VOP3Inst <"v_mul_f64", VOP3_Profile<VOP_F64_F64_F64>, fmul, 1>;
-def V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile<VOP_F64_F64_F64>, fminnum, 1>;
-def V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum, 1>;
+} // End FPDPRounding = 1
+def V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile<VOP_F64_F64_F64>, fminnum_like, 1>;
+def V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum_like, 1>;
} // End SchedRW = [WriteDoubleAdd]
let SchedRW = [WriteQuarterRate32] in {
@@ -323,6 +327,7 @@ def V_DIV_FMAS_F32 : VOP3_Pseudo <"v_div_fmas_f32", VOP_F32_F32_F32_F32_VCC,
def V_DIV_FMAS_F64 : VOP3_Pseudo <"v_div_fmas_f64", VOP_F64_F64_F64_F64_VCC,
getVOP3VCC<VOP_F64_F64_F64_F64_VCC, AMDGPUdiv_fmas>.ret> {
let SchedRW = [WriteDouble];
+ let FPDPRounding = 1;
}
} // End Uses = [VCC, EXEC]
@@ -353,10 +358,10 @@ def V_SAD_U32 : VOP3Inst <"v_sad_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CL
def V_CVT_PK_U8_F32 : VOP3Inst<"v_cvt_pk_u8_f32", VOP3_Profile<VOP_I32_F32_I32_I32>, int_amdgcn_cvt_pk_u8_f32>;
def V_DIV_FIXUP_F32 : VOP3Inst <"v_div_fixup_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUdiv_fixup>;
-let SchedRW = [WriteDoubleAdd] in {
+let SchedRW = [WriteDoubleAdd], FPDPRounding = 1 in {
def V_DIV_FIXUP_F64 : VOP3Inst <"v_div_fixup_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, AMDGPUdiv_fixup>;
def V_LDEXP_F64 : VOP3Inst <"v_ldexp_f64", VOP3_Profile<VOP_F64_F64_I32>, AMDGPUldexp, 1>;
-} // End SchedRW = [WriteDoubleAdd]
+} // End SchedRW = [WriteDoubleAdd], FPDPRounding = 1
def V_DIV_SCALE_F32 : VOP3_Pseudo <"v_div_scale_f32", VOP3b_F32_I1_F32_F32_F32, [], 1> {
let SchedRW = [WriteFloatFMA, WriteSALU];
@@ -367,6 +372,7 @@ def V_DIV_SCALE_F32 : VOP3_Pseudo <"v_div_scale_f32", VOP3b_F32_I1_F32_F32_F32,
def V_DIV_SCALE_F64 : VOP3_Pseudo <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64, [], 1> {
let SchedRW = [WriteDouble, WriteSALU];
let AsmMatchConverter = "";
+ let FPDPRounding = 1;
}
def V_MSAD_U8 : VOP3Inst <"v_msad_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
@@ -381,12 +387,12 @@ def V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I3
let SchedRW = [Write64Bit] in {
// These instructions only exist on SI and CI
-let SubtargetPredicate = isSICI in {
-def V_LSHL_B64 : VOP3Inst <"v_lshl_b64", VOP3_Profile<VOP_I64_I64_I32>>;
-def V_LSHR_B64 : VOP3Inst <"v_lshr_b64", VOP3_Profile<VOP_I64_I64_I32>>;
-def V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile<VOP_I64_I64_I32>>;
+let SubtargetPredicate = isSICI, Predicates = [isSICI] in {
+def V_LSHL_B64 : VOP3Inst <"v_lshl_b64", VOP3_Profile<VOP_PAT_GEN<VOP_I64_I64_I32>>, shl>;
+def V_LSHR_B64 : VOP3Inst <"v_lshr_b64", VOP3_Profile<VOP_PAT_GEN<VOP_I64_I64_I32>>, srl>;
+def V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile<VOP_PAT_GEN<VOP_I64_I64_I32>>, sra>;
def V_MULLIT_F32 : VOP3Inst <"v_mullit_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
-} // End SubtargetPredicate = isSICI
+} // End SubtargetPredicate = isSICI, Predicates = [isSICI]
let SubtargetPredicate = isVI in {
def V_LSHLREV_B64 : VOP3Inst <"v_lshlrev_b64", VOP3_Profile<VOP_I64_I32_I64>>;
@@ -395,6 +401,22 @@ def V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>>;
} // End SubtargetPredicate = isVI
} // End SchedRW = [Write64Bit]
+let Predicates = [isVI] in {
+def : GCNPat <
+ (getDivergentFrag<shl>.ret i64:$x, i32:$y),
+ (V_LSHLREV_B64 $y, $x)
+>;
+def : AMDGPUPat <
+ (getDivergentFrag<srl>.ret i64:$x, i32:$y),
+ (V_LSHRREV_B64 $y, $x)
+>;
+def : AMDGPUPat <
+ (getDivergentFrag<sra>.ret i64:$x, i32:$y),
+ (V_ASHRREV_I64 $y, $x)
+>;
+}
+
+
let SubtargetPredicate = isCIVI in {
let Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] in {
@@ -414,33 +436,51 @@ def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>;
def V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUdiv_fixup> {
let Predicates = [Has16BitInsts, isVIOnly];
+ let FPDPRounding = 1;
}
def V_DIV_FIXUP_F16_gfx9 : VOP3Inst <"v_div_fixup_f16_gfx9",
VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUdiv_fixup> {
let renamedInGFX9 = 1;
let Predicates = [Has16BitInsts, isGFX9];
+ let FPDPRounding = 1;
+}
+
+def V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fma> {
+ let Predicates = [Has16BitInsts, isVIOnly];
+ let FPDPRounding = 1;
+}
+def V_FMA_F16_gfx9 : VOP3Inst <"v_fma_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, fma> {
+ let renamedInGFX9 = 1;
+ let Predicates = [Has16BitInsts, isGFX9];
+ let FPDPRounding = 1;
}
let SubtargetPredicate = Has16BitInsts, isCommutable = 1 in {
let renamedInGFX9 = 1 in {
-def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fmad>;
def V_MAD_U16 : VOP3Inst <"v_mad_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_CLAMP>>;
def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_CLAMP>>;
-def V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fma>;
+let FPDPRounding = 1 in {
+def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fmad>;
+let Uses = [M0, EXEC] in {
def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i32, f32]>>;
-}
+} // End Uses = [M0, EXEC]
+} // End FPDPRounding = 1
+} // End renamedInGFX9 = 1
let SubtargetPredicate = isGFX9 in {
-def V_MAD_F16_gfx9 : VOP3Inst <"v_mad_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>>;
+def V_MAD_F16_gfx9 : VOP3Inst <"v_mad_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>> {
+ let FPDPRounding = 1;
+}
def V_MAD_U16_gfx9 : VOP3Inst <"v_mad_u16_gfx9", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>>;
def V_MAD_I16_gfx9 : VOP3Inst <"v_mad_i16_gfx9", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>>;
-def V_FMA_F16_gfx9 : VOP3Inst <"v_fma_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>>;
def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f16, f32, i32, f32]>>;
} // End SubtargetPredicate = isGFX9
+let Uses = [M0, EXEC], FPDPRounding = 1 in {
def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>>;
def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>>;
+} // End Uses = [M0, EXEC], FPDPRounding = 1
} // End SubtargetPredicate = Has16BitInsts, isCommutable = 1
@@ -461,17 +501,6 @@ def : GCNPat <
(inst i16:$src0, i16:$src1, i16:$src2, (i1 0))
>;
-def : GCNPat<
- (i32 (op3 (op2 (op1 i16:$src0, i16:$src1), i16:$src2))),
- (inst i16:$src0, i16:$src1, i16:$src2, (i1 0))
->;
-
-def : GCNPat<
- (i64 (op3 (op2 (op1 i16:$src0, i16:$src1), i16:$src2))),
- (REG_SEQUENCE VReg_64,
- (inst i16:$src0, i16:$src1, i16:$src2, (i1 0)), sub0,
- (V_MOV_B32_e32 (i32 0)), sub1)
->;
}
defm: Ternary_i16_Pats<mul, add, V_MAD_U16, zext>;
@@ -479,6 +508,37 @@ defm: Ternary_i16_Pats<mul, add, V_MAD_I16, sext>;
} // End Predicates = [Has16BitInsts]
+class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
+ (ops node:$x, node:$y, node:$z),
+ // When the inner operation is used multiple times, selecting 3-op
+ // instructions may still be beneficial -- if the other users can be
+ // combined similarly. Let's be conservative for now.
+ (op2 (HasOneUseBinOp<op1> node:$x, node:$y), node:$z),
+ [{
+ // Only use VALU ops when the result is divergent.
+ if (!N->isDivergent())
+ return false;
+
+ // Check constant bus limitations.
+ //
+ // Note: Use !isDivergent as a conservative proxy for whether the value
+ // is in an SGPR (uniform values can end up in VGPRs as well).
+ unsigned ConstantBusUses = 0;
+ for (unsigned i = 0; i < 3; ++i) {
+ if (!Operands[i]->isDivergent() &&
+ !isInlineImmediate(Operands[i].getNode())) {
+ ConstantBusUses++;
+ if (ConstantBusUses >= 2)
+ return false;
+ }
+ }
+
+ return true;
+ }]
+> {
+ let PredicateCodeUsesOperands = 1;
+}
+
let SubtargetPredicate = isGFX9 in {
def V_PACK_B32_F16 : VOP3Inst <"v_pack_b32_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>;
def V_LSHL_ADD_U32 : VOP3Inst <"v_lshl_add_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
@@ -513,6 +573,22 @@ def V_CVT_PKNORM_U16_F16 : VOP3Inst <"v_cvt_pknorm_u16_f16", VOP3_Profile<VOP_B3
def V_ADD_I32_gfx9 : VOP3Inst <"v_add_i32_gfx9", VOP3_Profile<VOP_I32_I32_I32>>;
def V_SUB_I32_gfx9 : VOP3Inst <"v_sub_i32_gfx9", VOP3_Profile<VOP_I32_I32_I32>>;
+
+
+class ThreeOp_i32_Pats <SDPatternOperator op1, SDPatternOperator op2, Instruction inst> : GCNPat <
+ // This matches (op2 (op1 i32:$src0, i32:$src1), i32:$src2) with conditions.
+ (ThreeOpFrag<op1, op2> i32:$src0, i32:$src1, i32:$src2),
+ (inst i32:$src0, i32:$src1, i32:$src2)
+>;
+
+def : ThreeOp_i32_Pats<shl, add, V_LSHL_ADD_U32>;
+def : ThreeOp_i32_Pats<add, shl, V_ADD_LSHL_U32>;
+def : ThreeOp_i32_Pats<add, add, V_ADD3_U32>;
+def : ThreeOp_i32_Pats<shl, or, V_LSHL_OR_B32>;
+def : ThreeOp_i32_Pats<and, or, V_AND_OR_B32>;
+def : ThreeOp_i32_Pats<or, or, V_OR3_B32>;
+def : ThreeOp_i32_Pats<xor, add, V_XAD_U32>;
+
} // End SubtargetPredicate = isGFX9
//===----------------------------------------------------------------------===//
@@ -662,23 +738,23 @@ defm V_MAD_I64_I32 : VOP3be_Real_ci <0x177>;
let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in {
multiclass VOP3_Real_vi<bits<10> op> {
- def _vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
- VOP3e_vi <op, !cast<VOP3_Pseudo>(NAME).Pfl>;
+ def _vi : VOP3_Real<!cast<VOP_Pseudo>(NAME), SIEncodingFamily.VI>,
+ VOP3e_vi <op, !cast<VOP_Pseudo>(NAME).Pfl>;
}
multiclass VOP3be_Real_vi<bits<10> op> {
- def _vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
- VOP3be_vi <op, !cast<VOP3_Pseudo>(NAME).Pfl>;
+ def _vi : VOP3_Real<!cast<VOP_Pseudo>(NAME), SIEncodingFamily.VI>,
+ VOP3be_vi <op, !cast<VOP_Pseudo>(NAME).Pfl>;
}
multiclass VOP3OpSel_Real_gfx9<bits<10> op> {
- def _vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
- VOP3OpSel_gfx9 <op, !cast<VOP3_Pseudo>(NAME).Pfl>;
+ def _vi : VOP3_Real<!cast<VOP_Pseudo>(NAME), SIEncodingFamily.VI>,
+ VOP3OpSel_gfx9 <op, !cast<VOP_Pseudo>(NAME).Pfl>;
}
multiclass VOP3Interp_Real_vi<bits<10> op> {
- def _vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
- VOP3Interp_vi <op, !cast<VOP3_Pseudo>(NAME).Pfl>;
+ def _vi : VOP3_Real<!cast<VOP_Pseudo>(NAME), SIEncodingFamily.VI>,
+ VOP3Interp_vi <op, !cast<VOP_Pseudo>(NAME).Pfl>;
}
} // End AssemblerPredicates = [isVI], DecoderNamespace = "VI"
@@ -786,12 +862,15 @@ defm V_FMA_F16 : VOP3_F16_Real_vi <0x1ee>;
defm V_DIV_FIXUP_F16 : VOP3_F16_Real_vi <0x1ef>;
defm V_INTERP_P2_F16 : VOP3Interp_F16_Real_vi <0x276>;
+let FPDPRounding = 1 in {
defm V_MAD_LEGACY_F16 : VOP3_F16_Real_gfx9 <0x1ea, "V_MAD_F16", "v_mad_legacy_f16">;
-defm V_MAD_LEGACY_U16 : VOP3_F16_Real_gfx9 <0x1eb, "V_MAD_U16", "v_mad_legacy_u16">;
-defm V_MAD_LEGACY_I16 : VOP3_F16_Real_gfx9 <0x1ec, "V_MAD_I16", "v_mad_legacy_i16">;
defm V_FMA_LEGACY_F16 : VOP3_F16_Real_gfx9 <0x1ee, "V_FMA_F16", "v_fma_legacy_f16">;
defm V_DIV_FIXUP_LEGACY_F16 : VOP3_F16_Real_gfx9 <0x1ef, "V_DIV_FIXUP_F16", "v_div_fixup_legacy_f16">;
defm V_INTERP_P2_LEGACY_F16 : VOP3Interp_F16_Real_gfx9 <0x276, "V_INTERP_P2_F16", "v_interp_p2_legacy_f16">;
+} // End FPDPRounding = 1
+
+defm V_MAD_LEGACY_U16 : VOP3_F16_Real_gfx9 <0x1eb, "V_MAD_U16", "v_mad_legacy_u16">;
+defm V_MAD_LEGACY_I16 : VOP3_F16_Real_gfx9 <0x1ec, "V_MAD_I16", "v_mad_legacy_i16">;
defm V_MAD_F16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x203, "v_mad_f16">;
defm V_MAD_U16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x204, "v_mad_u16">;
@@ -824,6 +903,9 @@ defm V_MUL_LO_I32 : VOP3_Real_vi <0x285>;
defm V_MUL_HI_U32 : VOP3_Real_vi <0x286>;
defm V_MUL_HI_I32 : VOP3_Real_vi <0x287>;
+defm V_READLANE_B32 : VOP3_Real_vi <0x289>;
+defm V_WRITELANE_B32 : VOP3_Real_vi <0x28a>;
+
defm V_LSHLREV_B64 : VOP3_Real_vi <0x28f>;
defm V_LSHRREV_B64 : VOP3_Real_vi <0x290>;
defm V_ASHRREV_I64 : VOP3_Real_vi <0x291>;
diff --git a/lib/Target/AMDGPU/VOP3PInstructions.td b/lib/Target/AMDGPU/VOP3PInstructions.td
index b51828b546797..91b45583c8489 100644
--- a/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -42,14 +42,16 @@ class VOP3_VOP3PInst<string OpName, VOPProfile P, bit UseTiedOutput = 0,
}
let isCommutable = 1 in {
-def V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, fma>;
def V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
def V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
+let FPDPRounding = 1 in {
+def V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, fma>;
def V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fadd>;
def V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmul>;
-def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum>;
-def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fminnum>;
+} // End FPDPRounding = 1
+def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum_like>;
+def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fminnum_like>;
def V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, add>;
def V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
@@ -137,12 +139,14 @@ let SubtargetPredicate = HasMadMixInsts in {
let isCommutable = 1 in {
def V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>;
+let FPDPRounding = 1 in {
// Clamp modifier is applied after conversion to f16.
def V_MAD_MIXLO_F16 : VOP3_VOP3PInst<"v_mad_mixlo_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
let ClampLo = 0, ClampHi = 1 in {
def V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
}
+} // End FPDPRounding = 1
}
defm : MadFmaMixPats<fmad, V_MAD_MIX_F32, V_MAD_MIXLO_F16, V_MAD_MIXHI_F16>;
@@ -154,18 +158,99 @@ let SubtargetPredicate = HasFmaMixInsts in {
let isCommutable = 1 in {
def V_FMA_MIX_F32 : VOP3_VOP3PInst<"v_fma_mix_f32", VOP3_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>;
+let FPDPRounding = 1 in {
// Clamp modifier is applied after conversion to f16.
def V_FMA_MIXLO_F16 : VOP3_VOP3PInst<"v_fma_mixlo_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
let ClampLo = 0, ClampHi = 1 in {
def V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
}
+} // End FPDPRounding = 1
}
defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
}
-let SubtargetPredicate = HasDLInsts in {
+// Defines patterns that extract the signed 4-bit field starting at bit Idx[0].
+foreach Idx = [[0,28],[4,24],[8,20],[12,16],[16,12],[20,8],[24,4]] in
+ def ExtractSigned4bit_#Idx[0] : PatFrag<(ops node:$src),
+ (sra (shl node:$src, (i32 Idx[1])), (i32 28))>;
+
+// Defines a pattern that extracts an unsigned (U = 1) or signed (U = 0) 4/8-bit field starting at FromBitIndex.
+class Extract<int FromBitIndex, int BitMask, bit U>: PatFrag<
+ (ops node:$src),
+ !if (!or (!and (!eq (BitMask, 255), !eq (FromBitIndex, 24)), !eq (FromBitIndex, 28)), // last element
+ !if (U, (srl node:$src, (i32 FromBitIndex)), (sra node:$src, (i32 FromBitIndex))),
+ !if (!eq (FromBitIndex, 0), // first element
+ !if (U, (and node:$src, (i32 BitMask)),
+ !if (!eq (BitMask, 15), (!cast<PatFrag>("ExtractSigned4bit_"#FromBitIndex) node:$src),
+ (sext_inreg node:$src, i8))),
+ !if (U, (and (srl node:$src, (i32 FromBitIndex)), (i32 BitMask)),
+ !if (!eq (BitMask, 15), (!cast<PatFrag>("ExtractSigned4bit_"#FromBitIndex) node:$src),
+ (sext_inreg (srl node:$src, (i32 FromBitIndex)), i8)))))>;
+
+
+foreach Type = ["I", "U"] in
+ foreach Index = 0-3 in {
+ // Defines patterns that extract each Index'ed 8bit (signed for "I",
+ // unsigned for "U") from a 32bit scalar value.
+ def #Type#Index#"_8bit" : Extract<!shl(Index, 3), 255, !if (!eq (Type, "U"), 1, 0)>;
+
+ // Defines multiplication patterns where the multiplication is happening on each
+ // Index'ed 8bit of a 32bit scalar value.
+
+ def Mul#Type#_Elt#Index : PatFrag<
+ (ops node:$src0, node:$src1),
+ (!cast<HasOneUseBinOp>(!if (!eq (Type, "I"), AMDGPUmul_i24_oneuse, AMDGPUmul_u24_oneuse))
+ (!cast<Extract>(#Type#Index#"_8bit") node:$src0),
+ (!cast<Extract>(#Type#Index#"_8bit") node:$src1))>;
+ }
+
+// Different variants of dot8 patterns cause a huge increase in the compile time.
+// Define non-associative/commutative add/mul to prevent permutation in the dot8
+// pattern.
+def NonACAdd : SDNode<"ISD::ADD" , SDTIntBinOp>;
+def NonACAdd_oneuse : HasOneUseBinOp<NonACAdd>;
+
+def NonACAMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24" , SDTIntBinOp>;
+def NonACAMDGPUmul_u24_oneuse : HasOneUseBinOp<NonACAMDGPUmul_u24>;
+
+def NonACAMDGPUmul_i24 : SDNode<"AMDGPUISD::MUL_I24" , SDTIntBinOp>;
+def NonACAMDGPUmul_i24_oneuse : HasOneUseBinOp<NonACAMDGPUmul_i24>;
+
+foreach Type = ["I", "U"] in
+ foreach Index = 0-7 in {
+ // Defines patterns that extract each Index'ed 4bit (signed for "I",
+ // unsigned for "U") from a 32bit scalar value.
+ def #Type#Index#"_4bit" : Extract<!shl(Index, 2), 15, !if (!eq (Type, "U"), 1, 0)>;
+
+ // Defines multiplication patterns where the multiplication is happening on each
+ // Index'ed 4bit of a 32bit scalar value.
+ def Mul#Type#Index#"_4bit" : PatFrag<
+ (ops node:$src0, node:$src1),
+ (!cast<HasOneUseBinOp>(!if (!eq (Type, "I"), NonACAMDGPUmul_i24_oneuse, NonACAMDGPUmul_u24_oneuse))
+ (!cast<Extract>(#Type#Index#"_4bit") node:$src0),
+ (!cast<Extract>(#Type#Index#"_4bit") node:$src1))>;
+ }
+
+class UDot2Pat<Instruction Inst> : GCNPat <
+ (add (add_oneuse (AMDGPUmul_u24_oneuse (srl i32:$src0, (i32 16)),
+ (srl i32:$src1, (i32 16))), i32:$src2),
+ (AMDGPUmul_u24_oneuse (and i32:$src0, (i32 65535)),
+ (and i32:$src1, (i32 65535)))
+ ),
+ (Inst (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))
+>;
+
+class SDot2Pat<Instruction Inst> : GCNPat <
+ (add (add_oneuse (AMDGPUmul_i24_oneuse (sra i32:$src0, (i32 16)),
+ (sra i32:$src1, (i32 16))), i32:$src2),
+ (AMDGPUmul_i24_oneuse (sext_inreg i32:$src0, i16),
+ (sext_inreg i32:$src1, i16))),
+ (Inst (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))
+>;
+
+let SubtargetPredicate = HasDotInsts in {
def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile<VOP_F32_V2F16_V2F16_F32>>;
def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>>;
@@ -192,7 +277,32 @@ defm : DotPats<int_amdgcn_udot4, V_DOT4_U32_U8>;
defm : DotPats<int_amdgcn_sdot8, V_DOT8_I32_I4>;
defm : DotPats<int_amdgcn_udot8, V_DOT8_U32_U4>;
-} // End SubtargetPredicate = HasDLInsts
+def : UDot2Pat<V_DOT2_U32_U16>;
+def : SDot2Pat<V_DOT2_I32_I16>;
+
+foreach Type = ["U", "I"] in
+ def : GCNPat <
+ !cast<dag>(!foldl((i32 i32:$src2), [0, 1, 2, 3], lhs, y,
+ (add_oneuse lhs, (!cast<PatFrag>("Mul"#Type#"_Elt"#y) i32:$src0, i32:$src1)))),
+ (!cast<VOP3PInst>("V_DOT4_"#Type#"32_"#Type#8) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>;
+
+foreach Type = ["U", "I"] in
+ def : GCNPat <
+ !cast<dag>(!foldl((add_oneuse i32:$src2, (!cast<PatFrag>("Mul"#Type#"0_4bit") i32:$src0, i32:$src1)),
+ [1, 2, 3, 4, 5, 6, 7], lhs, y,
+ (NonACAdd_oneuse lhs, (!cast<PatFrag>("Mul"#Type#y#"_4bit") i32:$src0, i32:$src1)))),
+ (!cast<VOP3PInst>("V_DOT8_"#Type#"32_"#Type#4) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>;
+
+// Different variants of dot8 code-gen dag patterns are not generated through table-gen due to a huge increase
+// in the compile time. Directly handle the pattern generated by the FE here.
+foreach Type = ["U", "I"] in
+ def : GCNPat <
+ !cast<dag>(!foldl((add_oneuse i32:$src2, (!cast<PatFrag>("Mul"#Type#"0_4bit") i32:$src0, i32:$src1)),
+ [7, 1, 2, 3, 4, 5, 6], lhs, y,
+ (NonACAdd_oneuse lhs, (!cast<PatFrag>("Mul"#Type#y#"_4bit") i32:$src0, i32:$src1)))),
+ (!cast<VOP3PInst>("V_DOT8_"#Type#"32_"#Type#4) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>;
+
+} // End SubtargetPredicate = HasDotInsts
multiclass VOP3P_Real_vi<bits<10> op> {
def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
@@ -242,7 +352,7 @@ defm V_FMA_MIXHI_F16 : VOP3P_Real_vi <0x3a2>;
}
-let SubtargetPredicate = HasDLInsts in {
+let SubtargetPredicate = HasDotInsts in {
defm V_DOT2_F32_F16 : VOP3P_Real_vi <0x3a3>;
defm V_DOT2_I32_I16 : VOP3P_Real_vi <0x3a6>;
@@ -252,4 +362,4 @@ defm V_DOT4_U32_U8 : VOP3P_Real_vi <0x3a9>;
defm V_DOT8_I32_I4 : VOP3P_Real_vi <0x3aa>;
defm V_DOT8_U32_U4 : VOP3P_Real_vi <0x3ab>;
-} // End SubtargetPredicate = HasDLInsts
+} // End SubtargetPredicate = HasDotInsts
diff --git a/lib/Target/AMDGPU/VOPCInstructions.td b/lib/Target/AMDGPU/VOPCInstructions.td
index cc6b8116afee1..091cac8cd35ca 100644
--- a/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/lib/Target/AMDGPU/VOPCInstructions.td
@@ -635,6 +635,17 @@ def : ICMP_Pattern <COND_SGE, V_CMP_GE_I64_e64, i64>;
def : ICMP_Pattern <COND_SLT, V_CMP_LT_I64_e64, i64>;
def : ICMP_Pattern <COND_SLE, V_CMP_LE_I64_e64, i64>;
+def : ICMP_Pattern <COND_EQ, V_CMP_EQ_U16_e64, i16>;
+def : ICMP_Pattern <COND_NE, V_CMP_NE_U16_e64, i16>;
+def : ICMP_Pattern <COND_UGT, V_CMP_GT_U16_e64, i16>;
+def : ICMP_Pattern <COND_UGE, V_CMP_GE_U16_e64, i16>;
+def : ICMP_Pattern <COND_ULT, V_CMP_LT_U16_e64, i16>;
+def : ICMP_Pattern <COND_ULE, V_CMP_LE_U16_e64, i16>;
+def : ICMP_Pattern <COND_SGT, V_CMP_GT_I16_e64, i16>;
+def : ICMP_Pattern <COND_SGE, V_CMP_GE_I16_e64, i16>;
+def : ICMP_Pattern <COND_SLT, V_CMP_LT_I16_e64, i16>;
+def : ICMP_Pattern <COND_SLE, V_CMP_LE_I16_e64, i16>;
+
class FCMP_Pattern <PatLeaf cond, Instruction inst, ValueType vt> : GCNPat <
(i64 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)),
(vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), cond)),
@@ -656,6 +667,14 @@ def : FCMP_Pattern <COND_OGE, V_CMP_GE_F64_e64, f64>;
def : FCMP_Pattern <COND_OLT, V_CMP_LT_F64_e64, f64>;
def : FCMP_Pattern <COND_OLE, V_CMP_LE_F64_e64, f64>;
+def : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F16_e64, f16>;
+def : FCMP_Pattern <COND_ONE, V_CMP_NEQ_F16_e64, f16>;
+def : FCMP_Pattern <COND_OGT, V_CMP_GT_F16_e64, f16>;
+def : FCMP_Pattern <COND_OGE, V_CMP_GE_F16_e64, f16>;
+def : FCMP_Pattern <COND_OLT, V_CMP_LT_F16_e64, f16>;
+def : FCMP_Pattern <COND_OLE, V_CMP_LE_F16_e64, f16>;
+
+
def : FCMP_Pattern <COND_UEQ, V_CMP_NLG_F32_e64, f32>;
def : FCMP_Pattern <COND_UNE, V_CMP_NEQ_F32_e64, f32>;
def : FCMP_Pattern <COND_UGT, V_CMP_NLE_F32_e64, f32>;
@@ -670,6 +689,13 @@ def : FCMP_Pattern <COND_UGE, V_CMP_NLT_F64_e64, f64>;
def : FCMP_Pattern <COND_ULT, V_CMP_NGE_F64_e64, f64>;
def : FCMP_Pattern <COND_ULE, V_CMP_NGT_F64_e64, f64>;
+def : FCMP_Pattern <COND_UEQ, V_CMP_NLG_F16_e64, f16>; // f16 patterns for the unordered (COND_U*) conditions; each selects a V_CMP_N* instruction.
+def : FCMP_Pattern <COND_UNE, V_CMP_NEQ_F16_e64, f16>;
+def : FCMP_Pattern <COND_UGT, V_CMP_NLE_F16_e64, f16>;
+def : FCMP_Pattern <COND_UGE, V_CMP_NLT_F16_e64, f16>;
+def : FCMP_Pattern <COND_ULT, V_CMP_NGE_F16_e64, f16>;
+def : FCMP_Pattern <COND_ULE, V_CMP_NGT_F16_e64, f16>;
+
//===----------------------------------------------------------------------===//
// Target
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/VOPInstructions.td b/lib/Target/AMDGPU/VOPInstructions.td
index f0f7f259f71d2..7de7d90d27b3a 100644
--- a/lib/Target/AMDGPU/VOPInstructions.td
+++ b/lib/Target/AMDGPU/VOPInstructions.td
@@ -420,10 +420,10 @@ class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
let SDWA = 1;
let Uses = [EXEC];
- let SubtargetPredicate = !if(P.HasExt, HasSDWA, DisableInst);
- let AssemblerPredicate = !if(P.HasExt, HasSDWA, DisableInst);
- let AsmVariantName = !if(P.HasExt, AMDGPUAsmVariants.SDWA,
- AMDGPUAsmVariants.Disable);
+ let SubtargetPredicate = !if(P.HasExtSDWA, HasSDWA, DisableInst);
+ let AssemblerPredicate = !if(P.HasExtSDWA, HasSDWA, DisableInst);
+ let AsmVariantName = !if(P.HasExtSDWA, AMDGPUAsmVariants.SDWA,
+ AMDGPUAsmVariants.Disable);
let DecoderNamespace = "SDWA";
VOPProfile Pfl = P;
@@ -471,10 +471,10 @@ class VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> :
let Constraints = ps.Constraints;
let DisableEncoding = ps.DisableEncoding;
- let SubtargetPredicate = !if(ps.Pfl.HasSDWA9, HasSDWA9, DisableInst);
- let AssemblerPredicate = !if(ps.Pfl.HasSDWA9, HasSDWA9, DisableInst);
- let AsmVariantName = !if(ps.Pfl.HasSDWA9, AMDGPUAsmVariants.SDWA9,
- AMDGPUAsmVariants.Disable);
+ let SubtargetPredicate = !if(ps.Pfl.HasExtSDWA9, HasSDWA9, DisableInst);
+ let AssemblerPredicate = !if(ps.Pfl.HasExtSDWA9, HasSDWA9, DisableInst);
+ let AsmVariantName = !if(ps.Pfl.HasExtSDWA9, AMDGPUAsmVariants.SDWA9,
+ AMDGPUAsmVariants.Disable);
let DecoderNamespace = "SDWA9";
// Copy relevant pseudo op flags
@@ -505,9 +505,14 @@ class VOP_DPPe<VOPProfile P> : Enc64 {
let Inst{63-60} = row_mask;
}
-class VOP_DPP <string OpName, VOPProfile P> :
- InstSI <P.OutsDPP, P.InsDPP, OpName#P.AsmDPP, []>,
- VOP_DPPe<P> {
+class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
+ InstSI <P.OutsDPP, P.InsDPP, OpName#P.AsmDPP, pattern>,
+ VOP <OpName>,
+ SIMCInstr <OpName#"_dpp", SIEncodingFamily.NONE>,
+ MnemonicAlias <OpName#"_dpp", OpName> {
+
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
let mayLoad = 0;
let mayStore = 0;
@@ -517,15 +522,99 @@ class VOP_DPP <string OpName, VOPProfile P> :
let VALU = 1;
let DPP = 1;
let Size = 8;
+ let Uses = [EXEC];
+ let isConvergent = 1;
+
+ string Mnemonic = OpName;
+ string AsmOperands = P.AsmDPP;
let AsmMatchConverter = !if(!eq(P.HasModifiers,1), "cvtDPP", "");
let SubtargetPredicate = HasDPP;
- let AssemblerPredicate = !if(P.HasExt, HasDPP, DisableInst);
- let AsmVariantName = !if(P.HasExt, AMDGPUAsmVariants.DPP,
- AMDGPUAsmVariants.Disable);
+ let AssemblerPredicate = !if(P.HasExtDPP, HasDPP, DisableInst);
+ let AsmVariantName = !if(P.HasExtDPP, AMDGPUAsmVariants.DPP,
+ AMDGPUAsmVariants.Disable);
let Constraints = !if(P.NumSrcArgs, "$old = $vdst", "");
let DisableEncoding = !if(P.NumSrcArgs, "$old", "");
let DecoderNamespace = "DPP";
+
+ VOPProfile Pfl = P;
+}
+
+class VOP_DPP_Real <VOP_DPP_Pseudo ps, int EncodingFamily> : // Non-pseudo DPP instruction whose operands, asm string and flags are taken from pseudo `ps`.
+ InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>, // Operand lists and asm string come from the pseudo; the pattern list is empty.
+ SIMCInstr <ps.PseudoInstr, EncodingFamily> { // Tagged with the pseudo's PseudoInstr and the requested EncodingFamily.
+
+ let isPseudo = 0; // Unlike VOP_DPP_Pseudo, which sets isPseudo and isCodeGenOnly to 1.
+ let isCodeGenOnly = 0;
+
+ let Defs = ps.Defs; // The remaining fields are copied verbatim from the pseudo.
+ let Uses = ps.Uses;
+ let SchedRW = ps.SchedRW;
+ let hasSideEffects = ps.hasSideEffects;
+
+ let Constraints = ps.Constraints;
+ let DisableEncoding = ps.DisableEncoding;
+
+ // Copy relevant pseudo op flags
+ let isConvergent = ps.isConvergent;
+ let SubtargetPredicate = ps.SubtargetPredicate;
+ let AssemblerPredicate = ps.AssemblerPredicate;
+ let AsmMatchConverter = ps.AsmMatchConverter;
+ let AsmVariantName = ps.AsmVariantName;
+ let UseNamedOperandTable = ps.UseNamedOperandTable;
+ let DecoderNamespace = ps.DecoderNamespace;
+ let Constraints = ps.Constraints; // NOTE(review): repeats the identical assignment made earlier in this class.
+ let DisableEncoding = ps.DisableEncoding; // NOTE(review): likewise a repeat of the earlier assignment.
+ let TSFlags = ps.TSFlags;
+}
+
+class getNumNodeArgs<SDPatternOperator Op> { // Exposes, as `ret`, the NumOperands of Op's type profile.
+ SDNode N = !cast<SDNode>(Op); // Op is cast to SDNode so that its TypeProfile can be read.
+ SDTypeProfile TP = N.TypeProfile;
+ int ret = TP.NumOperands;
+}
+
+
+class getDivergentFrag<SDPatternOperator Op> { // Builds, as `ret`, a PatFrag wrapping Op whose predicate is N->isDivergent().
+
+ int NumSrcArgs = getNumNodeArgs<Op>.ret;
+ PatFrag ret = PatFrag <
+ !if(!eq(NumSrcArgs, 1), // Operand list: one source, two sources, or three for any other count.
+ (ops node:$src0),
+ !if(!eq(NumSrcArgs, 2),
+ (ops node:$src0, node:$src1),
+ (ops node:$src0, node:$src1, node:$src2))),
+ !if(!eq(NumSrcArgs, 1), // Fragment body: Op applied to the same sources.
+ (Op $src0),
+ !if(!eq(NumSrcArgs, 2),
+ (Op $src0, $src1),
+ (Op $src0, $src1, $src2))),
+ [{ return N->isDivergent(); }] // Predicate code: match only when the node reports itself divergent.
+ >;
+}
+
+class VOPPatGen<SDPatternOperator Op, VOPProfile P> { // Derives a one-element pattern list `ret` from operator Op and profile P.
+
+ PatFrag Operator = getDivergentFrag < Op >.ret; // Divergent-only PatFrag form of Op.
+
+ dag Ins = !foreach(tmp, P.Ins32, !subst(ins, Operator, // Rewrites P.Ins32: `ins` -> Operator, Src0RC32/Src1RC32 -> Src0VT/Src1VT.
+ !subst(P.Src0RC32, P.Src0VT,
+ !subst(P.Src1RC32, P.Src1VT, tmp))));
+
+
+ dag Outs = !foreach(tmp, P.Outs32, !subst(outs, set, // Rewrites P.Outs32: `outs` -> `set`, P.DstRC -> P.DstVT.
+ !subst(P.DstRC, P.DstVT, tmp)));
+
+ list<dag> ret = [!con(Outs, (set Ins))]; // Outs concatenated with (set Ins), wrapped in a list.
+}
+
+class VOPPatOrNull<SDPatternOperator Op, VOPProfile P> { // `ret` is VOPPatGen's pattern list, or [] when P.NeedPatGen is PatGenMode.NoPattern.
+ list<dag> ret = !if(!ne(P.NeedPatGen,PatGenMode.NoPattern), VOPPatGen<Op, P>.ret, []);
+}
+
+class DivergentFragOrOp<SDPatternOperator Op, VOPProfile P> { // `ret` is either Op unchanged or its divergent-only PatFrag.
+ SDPatternOperator ret = !if(!eq(P.NeedPatGen,PatGenMode.Pattern), // The PatFrag is used only when P.NeedPatGen is PatGenMode.Pattern ...
+ !if(!isa<SDNode>(Op), getDivergentFrag<Op>.ret, Op), Op); // ... and Op is an SDNode; in every other case Op is passed through.
}
include "VOPCInstructions.td"