path: root/llvm/lib/Target/AMDGPU
author    Dimitry Andric <dim@FreeBSD.org>    2020-07-26 19:36:28 +0000
committer Dimitry Andric <dim@FreeBSD.org>    2020-07-26 19:36:28 +0000
commit    cfca06d7963fa0909f90483b42a6d7d194d01e08 (patch)
tree      209fb2a2d68f8f277793fc8df46c753d31bc853b /llvm/lib/Target/AMDGPU
parent    706b4fc47bbc608932d3b491ae19a3b9cde9497b (diff)
Diffstat (limited to 'llvm/lib/Target/AMDGPU')
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPU.h | 29
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPU.td | 278
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp | 10
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp | 7
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp | 69
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp | 14
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp | 119
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h | 35
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 201
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h | 16
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 7
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 247
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h | 10
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td | 26
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 714
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 69
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp | 150
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUExportClustering.h | 15
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUFeatures.td | 23
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp | 12
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 114
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def | 80
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp | 9
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h | 33
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp | 118
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h | 15
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 583
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 493
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 33
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInline.cpp | 44
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td | 90
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 2341
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h | 122
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 42
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 2747
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h | 93
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp | 56
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp | 21
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULibFunc.h | 5
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp | 12
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp | 27
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 8
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp | 18
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h | 14
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp | 359
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp | 153
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp | 25
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 255
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp | 137
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp | 154
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 2136
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h | 27
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp | 142
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h | 38
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.td | 21
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp | 11
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td | 11
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 181
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 172
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 113
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 507
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h | 94
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp | 112
-rw-r--r--  llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 280
-rw-r--r--  llvm/lib/Target/AMDGPU/BUFInstructions.td | 183
-rw-r--r--  llvm/lib/Target/AMDGPU/CaymanInstructions.td | 9
-rw-r--r--  llvm/lib/Target/AMDGPU/DSInstructions.td | 59
-rw-r--r--  llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 46
-rw-r--r--  llvm/lib/Target/AMDGPU/EvergreenInstructions.td | 10
-rw-r--r--  llvm/lib/Target/AMDGPU/FLATInstructions.td | 75
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp | 37
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 15
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp | 5
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h | 8
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNMinRegStrategy.cpp | 10
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp | 11
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNProcessors.td | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp | 109
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 27
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNRegPressure.h | 10
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 123
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 12
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp | 12
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp | 11
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 73
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 13
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp | 8
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp | 100
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h | 7
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp | 41
-rw-r--r--  llvm/lib/Target/AMDGPU/MIMGInstructions.td | 239
-rw-r--r--  llvm/lib/Target/AMDGPU/R600AsmPrinter.cpp | 14
-rw-r--r--  llvm/lib/Target/AMDGPU/R600AsmPrinter.h | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp | 7
-rw-r--r--  llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp | 8
-rw-r--r--  llvm/lib/Target/AMDGPU/R600FrameLowering.cpp | 11
-rw-r--r--  llvm/lib/Target/AMDGPU/R600FrameLowering.h | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/R600ISelLowering.cpp | 22
-rw-r--r--  llvm/lib/Target/AMDGPU/R600InstrInfo.cpp | 8
-rw-r--r--  llvm/lib/Target/AMDGPU/R600Instructions.td | 9
-rw-r--r--  llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp | 32
-rw-r--r--  llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp | 22
-rw-r--r--  llvm/lib/Target/AMDGPU/R600RegisterInfo.h | 11
-rw-r--r--  llvm/lib/Target/AMDGPU/R600RegisterInfo.td | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp | 8
-rw-r--r--  llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/SIDefines.h | 26
-rw-r--r--  llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 33
-rw-r--r--  llvm/lib/Target/AMDGPU/SIFixupVectorISel.cpp | 5
-rw-r--r--  llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 51
-rw-r--r--  llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 997
-rw-r--r--  llvm/lib/Target/AMDGPU/SIFrameLowering.h | 37
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2276
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.h | 63
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp | 203
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInsertSkips.cpp | 374
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 496
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrFormats.td | 5
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 1090
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.h | 67
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.td | 98
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstructions.td | 677
-rw-r--r--  llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 627
-rw-r--r--  llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp | 293
-rw-r--r--  llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp | 5
-rw-r--r--  llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp | 60
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 190
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h | 142
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp | 28
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMachineScheduler.h | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 28
-rw-r--r--  llvm/lib/Target/AMDGPU/SIModeRegister.cpp | 59
-rw-r--r--  llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp | 175
-rw-r--r--  llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 37
-rw-r--r--  llvm/lib/Target/AMDGPU/SIPostRABundler.cpp | 139
-rw-r--r--  llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 326
-rw-r--r--  llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 1387
-rw-r--r--  llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 165
-rw-r--r--  llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 453
-rw-r--r--  llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp | 10
-rw-r--r--  llvm/lib/Target/AMDGPU/SISchedule.td | 39
-rw-r--r--  llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp | 147
-rw-r--r--  llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 59
-rw-r--r--  llvm/lib/Target/AMDGPU/SMInstructions.td | 104
-rw-r--r--  llvm/lib/Target/AMDGPU/SOPInstructions.td | 153
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 204
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 153
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp | 114
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/VIInstructions.td | 13
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP1Instructions.td | 139
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP2Instructions.td | 76
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP3Instructions.td | 242
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 92
-rw-r--r--  llvm/lib/Target/AMDGPU/VOPCInstructions.td | 10
-rw-r--r--  llvm/lib/Target/AMDGPU/VOPInstructions.td | 43
172 files changed, 19350 insertions, 8444 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index fbed51de0ea49..88c79665be60d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -10,15 +10,16 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPU_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPU_H
-#include "llvm/Target/TargetMachine.h"
#include "llvm/IR/IntrinsicsR600.h" // TODO: Sink this.
#include "llvm/IR/IntrinsicsAMDGPU.h" // TODO: Sink this.
+#include "llvm/Support/CodeGen.h"
namespace llvm {
class AMDGPUTargetMachine;
class FunctionPass;
class GCNTargetMachine;
+class ImmutablePass;
class ModulePass;
class Pass;
class Target;
@@ -27,6 +28,14 @@ class TargetOptions;
class PassRegistry;
class Module;
+// GlobalISel passes
+void initializeAMDGPUPreLegalizerCombinerPass(PassRegistry &);
+FunctionPass *createAMDGPUPreLegalizeCombiner(bool IsOptNone);
+void initializeAMDGPUPostLegalizerCombinerPass(PassRegistry &);
+FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone);
+FunctionPass *createAMDGPURegBankCombiner(bool IsOptNone);
+void initializeAMDGPURegBankCombinerPass(PassRegistry &);
+
// R600 Passes
FunctionPass *createR600VectorRegMerger();
FunctionPass *createR600ExpandSpecialInstrsPass();
@@ -55,8 +64,9 @@ FunctionPass *createSIMemoryLegalizerPass();
FunctionPass *createSIInsertWaitcntsPass();
FunctionPass *createSIPreAllocateWWMRegsPass();
FunctionPass *createSIFormMemoryClausesPass();
-FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetOptions &,
- const TargetMachine *);
+
+FunctionPass *createSIPostRABundlerPass();
+FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetMachine *);
FunctionPass *createAMDGPUUseNativeCallsPass();
FunctionPass *createAMDGPUCodeGenPreparePass();
FunctionPass *createAMDGPUMachineCFGStructurizerPass();
@@ -159,6 +169,9 @@ extern char &SILowerControlFlowID;
void initializeSIRemoveShortExecBranchesPass(PassRegistry &);
extern char &SIRemoveShortExecBranchesID;
+void initializeSIPreEmitPeepholePass(PassRegistry &);
+extern char &SIPreEmitPeepholeID;
+
void initializeSIInsertSkipsPass(PassRegistry &);
extern char &SIInsertSkipsPassID;
@@ -185,6 +198,10 @@ FunctionPass *createAMDGPUPromoteAlloca();
void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
extern char &AMDGPUPromoteAllocaID;
+FunctionPass *createAMDGPUPromoteAllocaToVector();
+void initializeAMDGPUPromoteAllocaToVectorPass(PassRegistry&);
+extern char &AMDGPUPromoteAllocaToVectorID;
+
Pass *createAMDGPUStructurizeCFGPass();
FunctionPass *createAMDGPUISelDag(
TargetMachine *TM = nullptr,
@@ -219,12 +236,18 @@ extern char &SIMemoryLegalizerID;
void initializeSIModeRegisterPass(PassRegistry&);
extern char &SIModeRegisterID;
+void initializeSIInsertHardClausesPass(PassRegistry &);
+extern char &SIInsertHardClausesID;
+
void initializeSIInsertWaitcntsPass(PassRegistry&);
extern char &SIInsertWaitcntsID;
void initializeSIFormMemoryClausesPass(PassRegistry&);
extern char &SIFormMemoryClausesID;
+void initializeSIPostRABundlerPass(PassRegistry&);
+extern char &SIPostRABundlerID;
+
void initializeAMDGPUUnifyDivergentExitNodesPass(PassRegistry&);
extern char &AMDGPUUnifyDivergentExitNodesID;
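
As a side note to the declarations added above (this is not part of the commit): in LLVM targets, initialize*Pass functions of this kind are conventionally called once from the target's registration hook, while the create* factories are scheduled from AMDGPUTargetMachine's pass pipeline. A rough sketch of the registration side, with the call site assumed rather than taken from this diff:

    // Sketch only; the real wiring lives in AMDGPUTargetMachine.cpp.
    #include "AMDGPU.h"
    #include "llvm/PassRegistry.h"
    using namespace llvm;

    extern "C" void LLVMInitializeAMDGPUTarget() {
      PassRegistry &PR = *PassRegistry::getPassRegistry();
      // New GlobalISel combiners declared above.
      initializeAMDGPUPreLegalizerCombinerPass(PR);
      initializeAMDGPUPostLegalizerCombinerPass(PR);
      initializeAMDGPURegBankCombinerPass(PR);
      // New IR and machine passes declared above.
      initializeAMDGPUPromoteAllocaToVectorPass(PR);
      initializeSIInsertHardClausesPass(PR);
      initializeSIPostRABundlerPass(PR);
      initializeSIPreEmitPeepholePass(PR);
    }
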
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 42b477e07b3b7..e32f0fcc47713 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -33,6 +33,12 @@ def FeatureFastFMAF32 : SubtargetFeature<"fast-fmaf",
"Assuming f32 fma is at least as fast as mul + add"
>;
+def FeatureFastDenormalF32 : SubtargetFeature<"fast-denormal-f32",
+ "FastDenormalF32",
+ "true",
+ "Enabling denormals does not cause f32 instructions to run at f64 rates"
+>;
+
def FeatureMIMG_R128 : SubtargetFeature<"mimg-r128",
"MIMG_R128",
"true",
@@ -254,6 +260,12 @@ def FeatureGFX10Insts : SubtargetFeature<"gfx10-insts",
"Additional instructions for GFX10+"
>;
+def FeatureGFX10_3Insts : SubtargetFeature<"gfx10-3-insts",
+ "GFX10_3Insts",
+ "true",
+ "Additional instructions for GFX10.3"
+>;
+
def FeatureGFX7GFX8GFX9Insts : SubtargetFeature<"gfx7-gfx8-gfx9-insts",
"GFX7GFX8GFX9Insts",
"true",
@@ -360,7 +372,19 @@ def FeatureDPP8 : SubtargetFeature<"dpp8",
def FeatureR128A16 : SubtargetFeature<"r128-a16",
"HasR128A16",
"true",
- "Support 16 bit coordindates/gradients/lod/clamp/mip types on gfx9"
+ "Support gfx9-style A16 for 16-bit coordinates/gradients/lod/clamp/mip image operands, where a16 is aliased with r128"
+>;
+
+def FeatureGFX10A16 : SubtargetFeature<"a16",
+ "HasGFX10A16",
+ "true",
+ "Support gfx10-style A16 for 16-bit coordinates/gradients/lod/clamp/mip image operands"
+>;
+
+def FeatureG16 : SubtargetFeature<"g16",
+ "HasG16",
+ "true",
+ "Support G16 for 16-bit gradient image operands"
>;
def FeatureNSAEncoding : SubtargetFeature<"nsa-encoding",
@@ -369,6 +393,12 @@ def FeatureNSAEncoding : SubtargetFeature<"nsa-encoding",
"Support NSA encoding for image instructions"
>;
+def FeatureGFX10_BEncoding : SubtargetFeature<"gfx10_b-encoding",
+ "GFX10_BEncoding",
+ "true",
+ "Encoding format GFX10_B"
+>;
+
def FeatureIntClamp : SubtargetFeature<"int-clamp-insts",
"HasIntClamp",
"true",
@@ -439,7 +469,8 @@ def FeatureAtomicFaddInsts : SubtargetFeature<"atomic-fadd-insts",
"HasAtomicFaddInsts",
"true",
"Has buffer_atomic_add_f32, buffer_atomic_pk_add_f16, global_atomic_add_f32, "
- "global_atomic_pk_add_f16 instructions"
+ "global_atomic_pk_add_f16 instructions",
+ [FeatureFlatGlobalInsts]
>;
def FeatureDoesNotSupportSRAMECC : SubtargetFeature<"no-sram-ecc-support",
@@ -466,6 +497,30 @@ def FeatureVscnt : SubtargetFeature<"vscnt",
"Has separate store vscnt counter"
>;
+def FeatureGetWaveIdInst : SubtargetFeature<"get-wave-id-inst",
+ "HasGetWaveIdInst",
+ "true",
+ "Has s_get_waveid_in_workgroup instruction"
+>;
+
+def FeatureSMemTimeInst : SubtargetFeature<"s-memtime-inst",
+ "HasSMemTimeInst",
+ "true",
+ "Has s_memtime instruction"
+>;
+
+def FeatureMadMacF32Insts : SubtargetFeature<"mad-mac-f32-insts",
+ "HasMadMacF32Insts",
+ "true",
+ "Has v_mad_f32/v_mac_f32/v_madak_f32/v_madmk_f32 instructions"
+>;
+
+def FeatureDsSrc2Insts : SubtargetFeature<"ds-src2-insts",
+ "HasDsSrc2Insts",
+ "true",
+ "Has ds_*_src2 instructions"
+>;
+
def FeatureRegisterBanking : SubtargetFeature<"register-banking",
"HasRegisterBanking",
"true",
@@ -488,36 +543,6 @@ def FeatureNoDataDepHazard : SubtargetFeature<"no-data-dep-hazard",
// Subtarget Features (options and debugging)
//===------------------------------------------------------------===//
-// Denormal handling for fp64 and fp16 is controlled by the same
-// config register when fp16 supported.
-// TODO: Do we need a separate f16 setting when not legal?
-def FeatureFP64FP16Denormals : SubtargetFeature<"fp64-fp16-denormals",
- "FP64FP16Denormals",
- "true",
- "Enable double and half precision denormal handling",
- [FeatureFP64]
->;
-
-def FeatureFP64Denormals : SubtargetFeature<"fp64-denormals",
- "FP64FP16Denormals",
- "true",
- "Enable double and half precision denormal handling",
- [FeatureFP64, FeatureFP64FP16Denormals]
->;
-
-def FeatureFP16Denormals : SubtargetFeature<"fp16-denormals",
- "FP64FP16Denormals",
- "true",
- "Enable half precision denormal handling",
- [FeatureFP64FP16Denormals]
->;
-
-def FeatureFPExceptions : SubtargetFeature<"fp-exceptions",
- "FPExceptions",
- "true",
- "Enable floating point exceptions"
->;
-
class FeatureMaxPrivateElementSize<int size> : SubtargetFeature<
"max-private-element-size-"#size,
"MaxPrivateElementSize",
@@ -628,9 +653,10 @@ class GCNSubtargetFeatureGeneration <string Value,
def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
"southern-islands",
[FeatureFP64, FeatureLocalMemorySize32768, FeatureMIMG_R128,
- FeatureWavefrontSize64,
- FeatureLDSBankCount32, FeatureMovrel, FeatureTrigReducedRange,
- FeatureDoesNotSupportSRAMECC, FeatureDoesNotSupportXNACK]
+ FeatureWavefrontSize64, FeatureSMemTimeInst, FeatureMadMacF32Insts,
+ FeatureDsSrc2Insts, FeatureLDSBankCount32, FeatureMovrel,
+ FeatureTrigReducedRange, FeatureDoesNotSupportSRAMECC,
+ FeatureDoesNotSupportXNACK]
>;
def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS",
@@ -638,7 +664,8 @@ def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS",
[FeatureFP64, FeatureLocalMemorySize65536, FeatureMIMG_R128,
FeatureWavefrontSize64, FeatureFlatAddressSpace,
FeatureCIInsts, FeatureMovrel, FeatureTrigReducedRange,
- FeatureGFX7GFX8GFX9Insts, FeatureDoesNotSupportSRAMECC]
+ FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts,
+ FeatureDsSrc2Insts, FeatureDoesNotSupportSRAMECC]
>;
def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
@@ -649,8 +676,9 @@ def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
FeatureSMemRealTime, FeatureVGPRIndexMode, FeatureMovrel,
FeatureScalarStores, FeatureInv2PiInlineImm,
FeatureSDWA, FeatureSDWAOutModsVOPC, FeatureSDWAMac, FeatureDPP,
- FeatureIntClamp, FeatureTrigReducedRange, FeatureDoesNotSupportSRAMECC,
- FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts
+ FeatureIntClamp, FeatureTrigReducedRange, FeatureGFX8Insts,
+ FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts,
+ FeatureDsSrc2Insts, FeatureDoesNotSupportSRAMECC, FeatureFastDenormalF32
]
>;
@@ -665,7 +693,9 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",
FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst,
FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts,
FeatureAddNoCarryInsts, FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts,
- FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16
+ FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16,
+ FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts,
+ FeatureFastDenormalF32
]
>;
@@ -682,7 +712,8 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10",
FeatureAddNoCarryInsts, FeatureFmaMixInsts, FeatureGFX8Insts,
FeatureNoSdstCMPX, FeatureVscnt, FeatureRegisterBanking,
FeatureVOP3Literal, FeatureDPP8,
- FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureDoesNotSupportSRAMECC
+ FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureDoesNotSupportSRAMECC,
+ FeatureGFX10A16, FeatureFastDenormalF32, FeatureG16
]
>;
@@ -853,6 +884,10 @@ def FeatureISAVersion10_1_0 : FeatureSet<
FeatureScalarStores,
FeatureScalarAtomics,
FeatureScalarFlatScratchInsts,
+ FeatureGetWaveIdInst,
+ FeatureSMemTimeInst,
+ FeatureMadMacF32Insts,
+ FeatureDsSrc2Insts,
FeatureLdsMisalignedBug,
FeatureDoesNotSupportXNACK,
FeatureCodeObjectV3])>;
@@ -871,6 +906,10 @@ def FeatureISAVersion10_1_1 : FeatureSet<
FeatureScalarStores,
FeatureScalarAtomics,
FeatureScalarFlatScratchInsts,
+ FeatureGetWaveIdInst,
+ FeatureSMemTimeInst,
+ FeatureMadMacF32Insts,
+ FeatureDsSrc2Insts,
FeatureDoesNotSupportXNACK,
FeatureCodeObjectV3])>;
@@ -888,10 +927,29 @@ def FeatureISAVersion10_1_2 : FeatureSet<
FeatureScalarStores,
FeatureScalarAtomics,
FeatureScalarFlatScratchInsts,
+ FeatureGetWaveIdInst,
+ FeatureSMemTimeInst,
+ FeatureMadMacF32Insts,
+ FeatureDsSrc2Insts,
FeatureLdsMisalignedBug,
FeatureDoesNotSupportXNACK,
FeatureCodeObjectV3])>;
+def FeatureISAVersion10_3_0 : FeatureSet<
+ [FeatureGFX10,
+ FeatureGFX10_BEncoding,
+ FeatureGFX10_3Insts,
+ FeatureLDSBankCount32,
+ FeatureDLInsts,
+ FeatureDot1Insts,
+ FeatureDot2Insts,
+ FeatureDot5Insts,
+ FeatureDot6Insts,
+ FeatureNSAEncoding,
+ FeatureWavefrontSize32,
+ FeatureDoesNotSupportXNACK,
+ FeatureCodeObjectV3]>;
+
//===----------------------------------------------------------------------===//
def AMDGPUInstrInfo : InstrInfo {
@@ -973,190 +1031,222 @@ def NullALU : InstrItinClass;
def isGFX6 :
Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS">,
- AssemblerPredicate<"FeatureSouthernIslands">;
+ AssemblerPredicate<(all_of FeatureSouthernIslands)>;
def isGFX6GFX7 :
Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||"
"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS">,
- AssemblerPredicate<"!FeatureGCN3Encoding,!FeatureGFX10Insts">;
+ AssemblerPredicate<(all_of (not FeatureGCN3Encoding), (not FeatureGFX10Insts))>;
def isGFX6GFX7GFX10 :
Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||"
"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||"
"Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">,
- AssemblerPredicate<"!FeatureGCN3Encoding">;
+ AssemblerPredicate<(all_of (not FeatureGCN3Encoding))>;
def isGFX7Only :
Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS">,
- AssemblerPredicate<"!FeatureGCN3Encoding,FeatureCIInsts,!FeatureGFX10Insts">;
+ AssemblerPredicate<(all_of (not FeatureGCN3Encoding), FeatureCIInsts, (not FeatureGFX10Insts))>;
def isGFX7GFX10 :
Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||"
"Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">,
- AssemblerPredicate<"!FeatureGCN3Encoding,FeatureCIInsts">;
+ AssemblerPredicate<(all_of (not FeatureGCN3Encoding), FeatureCIInsts)>;
def isGFX7GFX8GFX9 :
Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||"
"Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||"
"Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">,
- AssemblerPredicate<"FeatureGFX7GFX8GFX9Insts">;
+ AssemblerPredicate<(all_of FeatureGFX7GFX8GFX9Insts)>;
def isGFX6GFX7GFX8GFX9 :
Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||"
"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||"
"Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||"
"Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">,
- AssemblerPredicate<"!FeatureGFX10Insts">;
+ AssemblerPredicate<(all_of (not FeatureGFX10Insts))>;
def isGFX7Plus :
Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS">,
- AssemblerPredicate<"FeatureCIInsts">;
+ AssemblerPredicate<(all_of FeatureCIInsts)>;
def isGFX8Plus :
Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">,
- AssemblerPredicate<"FeatureGFX8Insts">;
+ AssemblerPredicate<(all_of FeatureGFX8Insts)>;
def isGFX8Only : Predicate<"Subtarget->getGeneration() =="
"AMDGPUSubtarget::VOLCANIC_ISLANDS">,
- AssemblerPredicate <"FeatureVolcanicIslands">;
+ AssemblerPredicate <(all_of FeatureVolcanicIslands)>;
def isGFX9Plus :
Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9">,
- AssemblerPredicate<"FeatureGFX9Insts">;
+ AssemblerPredicate<(all_of FeatureGFX9Insts)>;
def isGFX9Only : Predicate <
"Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">,
- AssemblerPredicate<"FeatureGCN3Encoding,FeatureGFX9Insts">;
+ AssemblerPredicate<(all_of FeatureGCN3Encoding, FeatureGFX9Insts)>;
def isGFX8GFX9 :
Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||"
"Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">,
- AssemblerPredicate<"FeatureGFX8Insts,FeatureGCN3Encoding">;
+ AssemblerPredicate<(all_of FeatureGFX8Insts, FeatureGCN3Encoding)>;
def isGFX10Plus :
Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10">,
- AssemblerPredicate<"FeatureGFX10Insts">;
+ AssemblerPredicate<(all_of FeatureGFX10Insts)>;
def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">,
- AssemblerPredicate<"FeatureFlatAddressSpace">;
+ AssemblerPredicate<(all_of FeatureFlatAddressSpace)>;
def HasFlatGlobalInsts : Predicate<"Subtarget->hasFlatGlobalInsts()">,
- AssemblerPredicate<"FeatureFlatGlobalInsts">;
+ AssemblerPredicate<(all_of FeatureFlatGlobalInsts)>;
def HasFlatScratchInsts : Predicate<"Subtarget->hasFlatScratchInsts()">,
- AssemblerPredicate<"FeatureFlatScratchInsts">;
+ AssemblerPredicate<(all_of FeatureFlatScratchInsts)>;
def HasScalarFlatScratchInsts : Predicate<"Subtarget->hasScalarFlatScratchInsts()">,
- AssemblerPredicate<"FeatureScalarFlatScratchInsts">;
+ AssemblerPredicate<(all_of FeatureScalarFlatScratchInsts)>;
def HasD16LoadStore : Predicate<"Subtarget->hasD16LoadStore()">,
- AssemblerPredicate<"FeatureGFX9Insts">;
+ AssemblerPredicate<(all_of FeatureGFX9Insts)>;
+
+def HasGFX10_BEncoding : Predicate<"Subtarget->hasGFX10_BEncoding()">,
+ AssemblerPredicate<(all_of FeatureGFX10_BEncoding)>;
def HasUnpackedD16VMem : Predicate<"Subtarget->hasUnpackedD16VMem()">,
- AssemblerPredicate<"FeatureUnpackedD16VMem">;
+ AssemblerPredicate<(all_of FeatureUnpackedD16VMem)>;
def HasPackedD16VMem : Predicate<"!Subtarget->hasUnpackedD16VMem()">,
- AssemblerPredicate<"!FeatureUnpackedD16VMem">;
+ AssemblerPredicate<(all_of (not FeatureUnpackedD16VMem))>;
def D16PreservesUnusedBits :
Predicate<"Subtarget->d16PreservesUnusedBits()">,
- AssemblerPredicate<"FeatureGFX9Insts,!FeatureSRAMECC">;
+ AssemblerPredicate<(all_of FeatureGFX9Insts, (not FeatureSRAMECC))>;
def LDSRequiresM0Init : Predicate<"Subtarget->ldsRequiresM0Init()">;
def NotLDSRequiresM0Init : Predicate<"!Subtarget->ldsRequiresM0Init()">;
def HasDSAddTid : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9">,
- AssemblerPredicate<"FeatureGFX9Insts">;
+ AssemblerPredicate<(all_of FeatureGFX9Insts)>;
+
+def HasLDSFPAtomics : Predicate<"Subtarget->hasLDSFPAtomics()">,
+ AssemblerPredicate<(all_of FeatureGFX8Insts)>;
def HasAddNoCarryInsts : Predicate<"Subtarget->hasAddNoCarry()">,
- AssemblerPredicate<"FeatureAddNoCarryInsts">;
+ AssemblerPredicate<(all_of FeatureAddNoCarryInsts)>;
def NotHasAddNoCarryInsts : Predicate<"!Subtarget->hasAddNoCarry()">;
def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">,
- AssemblerPredicate<"Feature16BitInsts">;
+ AssemblerPredicate<(all_of Feature16BitInsts)>;
def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">,
- AssemblerPredicate<"FeatureVOP3P">;
+ AssemblerPredicate<(all_of FeatureVOP3P)>;
+
+def HasMinMaxDenormModes : Predicate<"Subtarget->supportsMinMaxDenormModes()">;
+def NotHasMinMaxDenormModes : Predicate<"!Subtarget->supportsMinMaxDenormModes()">;
def HasSDWA : Predicate<"Subtarget->hasSDWA()">,
- AssemblerPredicate<"FeatureSDWA,FeatureVolcanicIslands">;
+ AssemblerPredicate<(all_of FeatureSDWA, FeatureVolcanicIslands)>;
def HasSDWA9 :
Predicate<"Subtarget->hasSDWA()">,
- AssemblerPredicate<"FeatureGCN3Encoding,FeatureGFX9Insts,FeatureSDWA">;
+ AssemblerPredicate<(all_of FeatureGCN3Encoding, FeatureGFX9Insts,FeatureSDWA)>;
def HasSDWA10 :
Predicate<"Subtarget->hasSDWA()">,
- AssemblerPredicate<"!FeatureGCN3Encoding,FeatureGFX10Insts,FeatureSDWA">;
+ AssemblerPredicate<(all_of (not FeatureGCN3Encoding), FeatureGFX10Insts, FeatureSDWA)>;
def HasDPP : Predicate<"Subtarget->hasDPP()">,
- AssemblerPredicate<"FeatureGCN3Encoding,FeatureDPP">;
+ AssemblerPredicate<(all_of FeatureGCN3Encoding, FeatureDPP)>;
def HasDPP8 : Predicate<"Subtarget->hasDPP8()">,
- AssemblerPredicate<"!FeatureGCN3Encoding,FeatureGFX10Insts,FeatureDPP8">;
+ AssemblerPredicate<(all_of (not FeatureGCN3Encoding), FeatureGFX10Insts, FeatureDPP8)>;
def HasR128A16 : Predicate<"Subtarget->hasR128A16()">,
- AssemblerPredicate<"FeatureR128A16">;
+ AssemblerPredicate<(all_of FeatureR128A16)>;
+
+def HasGFX10A16 : Predicate<"Subtarget->hasGFX10A16()">,
+ AssemblerPredicate<(all_of FeatureGFX10A16)>;
+
+def HasG16 : Predicate<"Subtarget->hasG16()">,
+ AssemblerPredicate<(all_of FeatureG16)>;
def HasDPP16 : Predicate<"Subtarget->hasDPP()">,
- AssemblerPredicate<"!FeatureGCN3Encoding,FeatureGFX10Insts,FeatureDPP">;
+ AssemblerPredicate<(all_of (not FeatureGCN3Encoding), FeatureGFX10Insts, FeatureDPP)>;
def HasIntClamp : Predicate<"Subtarget->hasIntClamp()">,
- AssemblerPredicate<"FeatureIntClamp">;
+ AssemblerPredicate<(all_of FeatureIntClamp)>;
def HasMadMixInsts : Predicate<"Subtarget->hasMadMixInsts()">,
- AssemblerPredicate<"FeatureMadMixInsts">;
+ AssemblerPredicate<(all_of FeatureMadMixInsts)>;
def HasScalarStores : Predicate<"Subtarget->hasScalarStores()">,
- AssemblerPredicate<"FeatureScalarStores">;
+ AssemblerPredicate<(all_of FeatureScalarStores)>;
def HasScalarAtomics : Predicate<"Subtarget->hasScalarAtomics()">,
- AssemblerPredicate<"FeatureScalarAtomics">;
+ AssemblerPredicate<(all_of FeatureScalarAtomics)>;
def HasNoSdstCMPX : Predicate<"Subtarget->hasNoSdstCMPX()">,
- AssemblerPredicate<"FeatureNoSdstCMPX">;
+ AssemblerPredicate<(all_of FeatureNoSdstCMPX)>;
def HasSdstCMPX : Predicate<"!Subtarget->hasNoSdstCMPX()">,
- AssemblerPredicate<"!FeatureNoSdstCMPX">;
+ AssemblerPredicate<(all_of (not FeatureNoSdstCMPX))>;
def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">;
def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">;
def HasVGPRIndexMode : Predicate<"Subtarget->hasVGPRIndexMode()">,
- AssemblerPredicate<"FeatureVGPRIndexMode">;
+ AssemblerPredicate<(all_of FeatureVGPRIndexMode)>;
def HasMovrel : Predicate<"Subtarget->hasMovrel()">,
- AssemblerPredicate<"FeatureMovrel">;
+ AssemblerPredicate<(all_of FeatureMovrel)>;
def HasFmaMixInsts : Predicate<"Subtarget->hasFmaMixInsts()">,
- AssemblerPredicate<"FeatureFmaMixInsts">;
+ AssemblerPredicate<(all_of FeatureFmaMixInsts)>;
def HasDLInsts : Predicate<"Subtarget->hasDLInsts()">,
- AssemblerPredicate<"FeatureDLInsts">;
+ AssemblerPredicate<(all_of FeatureDLInsts)>;
def HasDot1Insts : Predicate<"Subtarget->hasDot1Insts()">,
- AssemblerPredicate<"FeatureDot1Insts">;
+ AssemblerPredicate<(all_of FeatureDot1Insts)>;
def HasDot2Insts : Predicate<"Subtarget->hasDot2Insts()">,
- AssemblerPredicate<"FeatureDot2Insts">;
+ AssemblerPredicate<(all_of FeatureDot2Insts)>;
def HasDot3Insts : Predicate<"Subtarget->hasDot3Insts()">,
- AssemblerPredicate<"FeatureDot3Insts">;
+ AssemblerPredicate<(all_of FeatureDot3Insts)>;
def HasDot4Insts : Predicate<"Subtarget->hasDot4Insts()">,
- AssemblerPredicate<"FeatureDot4Insts">;
+ AssemblerPredicate<(all_of FeatureDot4Insts)>;
def HasDot5Insts : Predicate<"Subtarget->hasDot5Insts()">,
- AssemblerPredicate<"FeatureDot5Insts">;
+ AssemblerPredicate<(all_of FeatureDot5Insts)>;
def HasDot6Insts : Predicate<"Subtarget->hasDot6Insts()">,
- AssemblerPredicate<"FeatureDot6Insts">;
+ AssemblerPredicate<(all_of FeatureDot6Insts)>;
+
+def HasGetWaveIdInst : Predicate<"Subtarget->hasGetWaveIdInst()">,
+ AssemblerPredicate<(all_of FeatureGetWaveIdInst)>;
def HasMAIInsts : Predicate<"Subtarget->hasMAIInsts()">,
- AssemblerPredicate<"FeatureMAIInsts">;
+ AssemblerPredicate<(all_of FeatureMAIInsts)>;
+
+def HasSMemTimeInst : Predicate<"Subtarget->hasSMemTimeInst()">,
+ AssemblerPredicate<(all_of FeatureSMemTimeInst)>;
+
+def HasNoSMemTimeInst : Predicate<"!Subtarget->hasSMemTimeInst()">;
def HasPkFmacF16Inst : Predicate<"Subtarget->hasPkFmacF16Inst()">,
- AssemblerPredicate<"FeaturePkFmacF16Inst">;
+ AssemblerPredicate<(all_of FeaturePkFmacF16Inst)>;
+
+def HasMadMacF32Insts : Predicate<"Subtarget->hasMadMacF32Insts()">,
+ AssemblerPredicate<(all_of FeatureMadMacF32Insts)>;
def HasAtomicFaddInsts : Predicate<"Subtarget->hasAtomicFaddInsts()">,
- AssemblerPredicate<"FeatureAtomicFaddInsts">;
+ AssemblerPredicate<(all_of FeatureAtomicFaddInsts)>;
+
+def HasNoMadMacF32Insts : Predicate<"!Subtarget->hasMadMacF32Insts()">,
+ AssemblerPredicate<(all_of (not FeatureMadMacF32Insts))>;
+
+def HasDsSrc2Insts : Predicate<"!Subtarget->hasDsSrc2Insts()">,
+ AssemblerPredicate<(all_of FeatureDsSrc2Insts)>;
def HasOffset3fBug : Predicate<"!Subtarget->hasOffset3fBug()">,
- AssemblerPredicate<"FeatureOffset3fBug">;
+ AssemblerPredicate<(all_of FeatureOffset3fBug)>;
def EnableLateCFGStructurize : Predicate<
"EnableLateStructurizeCFG">;
@@ -1165,7 +1255,7 @@ def EnableLateCFGStructurize : Predicate<
include "SISchedule.td"
include "GCNProcessors.td"
include "AMDGPUInstrInfo.td"
-include "AMDGPURegisterInfo.td"
+include "SIRegisterInfo.td"
include "AMDGPURegisterBanks.td"
include "AMDGPUInstructions.td"
include "SIInstrInfo.td"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
index bba132c3bc46f..bb2aba0449748 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
@@ -91,12 +91,16 @@ AliasResult AMDGPUAAResult::alias(const MemoryLocation &LocA,
bool AMDGPUAAResult::pointsToConstantMemory(const MemoryLocation &Loc,
AAQueryInfo &AAQI, bool OrLocal) {
+ unsigned AS = Loc.Ptr->getType()->getPointerAddressSpace();
+ if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
+ AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
+ return true;
+
const Value *Base = GetUnderlyingObject(Loc.Ptr, DL);
- unsigned AS = Base->getType()->getPointerAddressSpace();
+ AS = Base->getType()->getPointerAddressSpace();
if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
- AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
+ AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
return true;
- }
if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Base)) {
if (GV->isConstant())
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h
index fb722920900f0..fd8889ea5c0dd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h
@@ -48,10 +48,6 @@ public:
AAQueryInfo &AAQI);
bool pointsToConstantMemory(const MemoryLocation &Loc, AAQueryInfo &AAQI,
bool OrLocal);
-
-private:
- bool Aliases(const MDNode *A, const MDNode *B) const;
- bool PathAliases(const MDNode *A, const MDNode *B) const;
};
/// Analysis pass providing a never-invalidated alias analysis result.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
index ff2bda6bed533..22947544ac07f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
@@ -71,6 +71,13 @@ void AMDGPUAlwaysInline::recursivelyVisitUsers(
if (Instruction *I = dyn_cast<Instruction>(U)) {
Function *F = I->getParent()->getParent();
if (!AMDGPU::isEntryFunctionCC(F->getCallingConv())) {
+ // FIXME: This is a horrible hack. We should always respect noinline,
+ // and just let us hit the error when we can't handle this.
+ //
+ // Unfortunately, clang adds noinline to all functions at -O0. We have
+ // to override this here. until that's fixed.
+ F->removeFnAttr(Attribute::NoInline);
+
FuncsToAlwaysInline.insert(F);
Stack.push_back(F);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
index e72b3f4fde633..625074569cfa4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -21,7 +21,6 @@
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
@@ -71,7 +70,8 @@ public:
static bool visitConstantExpr(const ConstantExpr *CE);
static bool visitConstantExprsRecursively(
const Constant *EntryC,
- SmallPtrSet<const Constant *, 8> &ConstantExprVisited);
+ SmallPtrSet<const Constant *, 8> &ConstantExprVisited, bool IsFunc,
+ bool HasApertureRegs);
};
} // end anonymous namespace
@@ -93,6 +93,14 @@ static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
return castRequiresQueuePtr(ASC->getSrcAddressSpace());
}
+static bool isDSAddress(const Constant *C) {
+ const GlobalValue *GV = dyn_cast<GlobalValue>(C);
+ if (!GV)
+ return false;
+ unsigned AS = GV->getAddressSpace();
+ return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
+}
+
bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
if (CE->getOpcode() == Instruction::AddrSpaceCast) {
unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
@@ -104,7 +112,8 @@ bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
const Constant *EntryC,
- SmallPtrSet<const Constant *, 8> &ConstantExprVisited) {
+ SmallPtrSet<const Constant *, 8> &ConstantExprVisited,
+ bool IsFunc, bool HasApertureRegs) {
if (!ConstantExprVisited.insert(EntryC).second)
return false;
@@ -115,9 +124,13 @@ bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
while (!Stack.empty()) {
const Constant *C = Stack.pop_back_val();
+ // We need to trap on DS globals in non-entry functions.
+ if (IsFunc && isDSAddress(C))
+ return true;
+
// Check this constant expression.
if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
- if (visitConstantExpr(CE))
+ if (!HasApertureRegs && visitConstantExpr(CE))
return true;
}
@@ -202,7 +215,7 @@ static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
"amdgpu-work-item-id-z", "amdgpu-work-group-id-x",
"amdgpu-work-group-id-y", "amdgpu-work-group-id-z",
"amdgpu-dispatch-ptr", "amdgpu-dispatch-id",
- "amdgpu-kernarg-segment-ptr", "amdgpu-implicitarg-ptr"};
+ "amdgpu-implicitarg-ptr"};
if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
NeedQueuePtr = true;
@@ -263,10 +276,10 @@ bool AMDGPUAnnotateKernelFeatures::propagateUniformWorkGroupAttribute(
bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
- bool HasFlat = ST.hasFlatAddressSpace();
bool HasApertureRegs = ST.hasApertureRegs();
SmallPtrSet<const Constant *, 8> ConstantExprVisited;
+ bool HaveStackObjects = false;
bool Changed = false;
bool NeedQueuePtr = false;
bool HaveCall = false;
@@ -274,13 +287,18 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
for (BasicBlock &BB : F) {
for (Instruction &I : BB) {
- CallSite CS(&I);
- if (CS) {
- Function *Callee = CS.getCalledFunction();
+ if (isa<AllocaInst>(I)) {
+ HaveStackObjects = true;
+ continue;
+ }
+
+ if (auto *CB = dyn_cast<CallBase>(&I)) {
+ const Function *Callee =
+ dyn_cast<Function>(CB->getCalledOperand()->stripPointerCasts());
// TODO: Do something with indirect calls.
if (!Callee) {
- if (!CS.isInlineAsm())
+ if (!CB->isInlineAsm())
HaveCall = true;
continue;
}
@@ -292,20 +310,25 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
Changed = true;
} else {
bool NonKernelOnly = false;
- StringRef AttrName = intrinsicToAttrName(IID,
- NonKernelOnly, NeedQueuePtr);
- if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
- F.addFnAttr(AttrName);
- Changed = true;
+
+ if (!IsFunc && IID == Intrinsic::amdgcn_kernarg_segment_ptr) {
+ F.addFnAttr("amdgpu-kernarg-segment-ptr");
+ } else {
+ StringRef AttrName = intrinsicToAttrName(IID, NonKernelOnly,
+ NeedQueuePtr);
+ if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
+ F.addFnAttr(AttrName);
+ Changed = true;
+ }
}
}
}
- if (NeedQueuePtr || HasApertureRegs)
+ if (NeedQueuePtr || (!IsFunc && HasApertureRegs))
continue;
if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
- if (castRequiresQueuePtr(ASC)) {
+ if (!HasApertureRegs && castRequiresQueuePtr(ASC)) {
NeedQueuePtr = true;
continue;
}
@@ -316,7 +339,8 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
if (!OpC)
continue;
- if (visitConstantExprsRecursively(OpC, ConstantExprVisited)) {
+ if (visitConstantExprsRecursively(OpC, ConstantExprVisited, IsFunc,
+ HasApertureRegs)) {
NeedQueuePtr = true;
break;
}
@@ -332,8 +356,13 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
// TODO: We could refine this to captured pointers that could possibly be
// accessed by flat instructions. For now this is mostly a poor way of
// estimating whether there are calls before argument lowering.
- if (HasFlat && !IsFunc && HaveCall) {
- F.addFnAttr("amdgpu-flat-scratch");
+ if (!IsFunc && HaveCall) {
+ F.addFnAttr("amdgpu-calls");
+ Changed = true;
+ }
+
+ if (HaveStackObjects) {
+ F.addFnAttr("amdgpu-stack-objects");
Changed = true;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
index 6fb507083cef1..b09e92c07f9ba 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
@@ -35,7 +36,7 @@ class AMDGPUAnnotateUniformValues : public FunctionPass,
MemoryDependenceResults *MDR;
LoopInfo *LI;
DenseMap<Value*, GetElementPtrInst*> noClobberClones;
- bool isKernelFunc;
+ bool isEntryFunc;
public:
static char ID;
@@ -127,11 +128,10 @@ void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) {
auto isGlobalLoad = [&](LoadInst &Load)->bool {
return Load.getPointerAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
};
- // We're tracking up to the Function boundaries
- // We cannot go beyond because of FunctionPass restrictions
- // Thus we can ensure that memory not clobbered for memory
- // operations that live in kernel only.
- bool NotClobbered = isKernelFunc && !isClobberedInFunction(&I);
+ // We're tracking up to the Function boundaries, and cannot go beyond because
+ // of FunctionPass restrictions. We can ensure that is memory not clobbered
+ // for memory operations that are live in to entry points only.
+ bool NotClobbered = isEntryFunc && !isClobberedInFunction(&I);
Instruction *PtrI = dyn_cast<Instruction>(Ptr);
if (!PtrI && NotClobbered && isGlobalLoad(I)) {
if (isa<Argument>(Ptr) || isa<GlobalValue>(Ptr)) {
@@ -170,7 +170,7 @@ bool AMDGPUAnnotateUniformValues::runOnFunction(Function &F) {
DA = &getAnalysis<LegacyDivergenceAnalysis>();
MDR = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- isKernelFunc = F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
+ isEntryFunc = AMDGPU::isEntryFunctionCC(F.getCallingConv());
visit(F);
noClobberClones.clear();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
index 99a01ca3a2fda..d078fc147a36a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
@@ -8,6 +8,8 @@
#include "AMDGPU.h"
#include "AMDGPUArgumentUsageInfo.h"
+#include "AMDGPUTargetMachine.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIRegisterInfo.h"
#include "llvm/Support/NativeFormatting.h"
#include "llvm/Support/raw_ostream.h"
@@ -43,6 +45,10 @@ char AMDGPUArgumentUsageInfo::ID = 0;
const AMDGPUFunctionArgInfo AMDGPUArgumentUsageInfo::ExternFunctionInfo{};
+// Hardcoded registers from fixed function ABI
+const AMDGPUFunctionArgInfo AMDGPUArgumentUsageInfo::FixedABIFunctionInfo
+ = AMDGPUFunctionArgInfo::fixedABILayout();
+
bool AMDGPUArgumentUsageInfo::doInitialization(Module &M) {
return false;
}
@@ -77,59 +83,102 @@ void AMDGPUArgumentUsageInfo::print(raw_ostream &OS, const Module *M) const {
}
}
-std::pair<const ArgDescriptor *, const TargetRegisterClass *>
+std::tuple<const ArgDescriptor *, const TargetRegisterClass *, LLT>
AMDGPUFunctionArgInfo::getPreloadedValue(
- AMDGPUFunctionArgInfo::PreloadedValue Value) const {
+ AMDGPUFunctionArgInfo::PreloadedValue Value) const {
switch (Value) {
case AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER: {
- return std::make_pair(
- PrivateSegmentBuffer ? &PrivateSegmentBuffer : nullptr,
- &AMDGPU::SGPR_128RegClass);
+ return std::make_tuple(PrivateSegmentBuffer ? &PrivateSegmentBuffer
+ : nullptr,
+ &AMDGPU::SGPR_128RegClass, LLT::vector(4, 32));
}
case AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR:
- return std::make_pair(ImplicitBufferPtr ? &ImplicitBufferPtr : nullptr,
- &AMDGPU::SGPR_64RegClass);
+ return std::make_tuple(ImplicitBufferPtr ? &ImplicitBufferPtr : nullptr,
+ &AMDGPU::SGPR_64RegClass,
+ LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
- return std::make_pair(WorkGroupIDX ? &WorkGroupIDX : nullptr,
- &AMDGPU::SGPR_32RegClass);
-
+ return std::make_tuple(WorkGroupIDX ? &WorkGroupIDX : nullptr,
+ &AMDGPU::SGPR_32RegClass, LLT::scalar(32));
case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
- return std::make_pair(WorkGroupIDY ? &WorkGroupIDY : nullptr,
- &AMDGPU::SGPR_32RegClass);
+ return std::make_tuple(WorkGroupIDY ? &WorkGroupIDY : nullptr,
+ &AMDGPU::SGPR_32RegClass, LLT::scalar(32));
case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
- return std::make_pair(WorkGroupIDZ ? &WorkGroupIDZ : nullptr,
- &AMDGPU::SGPR_32RegClass);
+ return std::make_tuple(WorkGroupIDZ ? &WorkGroupIDZ : nullptr,
+ &AMDGPU::SGPR_32RegClass, LLT::scalar(32));
case AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET:
- return std::make_pair(
- PrivateSegmentWaveByteOffset ? &PrivateSegmentWaveByteOffset : nullptr,
- &AMDGPU::SGPR_32RegClass);
+ return std::make_tuple(
+ PrivateSegmentWaveByteOffset ? &PrivateSegmentWaveByteOffset : nullptr,
+ &AMDGPU::SGPR_32RegClass, LLT::scalar(32));
case AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR:
- return std::make_pair(KernargSegmentPtr ? &KernargSegmentPtr : nullptr,
- &AMDGPU::SGPR_64RegClass);
+ return std::make_tuple(KernargSegmentPtr ? &KernargSegmentPtr : nullptr,
+ &AMDGPU::SGPR_64RegClass,
+ LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
case AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR:
- return std::make_pair(ImplicitArgPtr ? &ImplicitArgPtr : nullptr,
- &AMDGPU::SGPR_64RegClass);
+ return std::make_tuple(ImplicitArgPtr ? &ImplicitArgPtr : nullptr,
+ &AMDGPU::SGPR_64RegClass,
+ LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
case AMDGPUFunctionArgInfo::DISPATCH_ID:
- return std::make_pair(DispatchID ? &DispatchID : nullptr,
- &AMDGPU::SGPR_64RegClass);
+ return std::make_tuple(DispatchID ? &DispatchID : nullptr,
+ &AMDGPU::SGPR_64RegClass, LLT::scalar(64));
case AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT:
- return std::make_pair(FlatScratchInit ? &FlatScratchInit : nullptr,
- &AMDGPU::SGPR_64RegClass);
+ return std::make_tuple(FlatScratchInit ? &FlatScratchInit : nullptr,
+ &AMDGPU::SGPR_64RegClass, LLT::scalar(64));
case AMDGPUFunctionArgInfo::DISPATCH_PTR:
- return std::make_pair(DispatchPtr ? &DispatchPtr : nullptr,
- &AMDGPU::SGPR_64RegClass);
+ return std::make_tuple(DispatchPtr ? &DispatchPtr : nullptr,
+ &AMDGPU::SGPR_64RegClass,
+ LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
case AMDGPUFunctionArgInfo::QUEUE_PTR:
- return std::make_pair(QueuePtr ? &QueuePtr : nullptr,
- &AMDGPU::SGPR_64RegClass);
+ return std::make_tuple(QueuePtr ? &QueuePtr : nullptr,
+ &AMDGPU::SGPR_64RegClass,
+ LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
case AMDGPUFunctionArgInfo::WORKITEM_ID_X:
- return std::make_pair(WorkItemIDX ? &WorkItemIDX : nullptr,
- &AMDGPU::VGPR_32RegClass);
+ return std::make_tuple(WorkItemIDX ? &WorkItemIDX : nullptr,
+ &AMDGPU::VGPR_32RegClass, LLT::scalar(32));
case AMDGPUFunctionArgInfo::WORKITEM_ID_Y:
- return std::make_pair(WorkItemIDY ? &WorkItemIDY : nullptr,
- &AMDGPU::VGPR_32RegClass);
+ return std::make_tuple(WorkItemIDY ? &WorkItemIDY : nullptr,
+ &AMDGPU::VGPR_32RegClass, LLT::scalar(32));
case AMDGPUFunctionArgInfo::WORKITEM_ID_Z:
- return std::make_pair(WorkItemIDZ ? &WorkItemIDZ : nullptr,
- &AMDGPU::VGPR_32RegClass);
+ return std::make_tuple(WorkItemIDZ ? &WorkItemIDZ : nullptr,
+ &AMDGPU::VGPR_32RegClass, LLT::scalar(32));
}
llvm_unreachable("unexpected preloaded value type");
}
+
+constexpr AMDGPUFunctionArgInfo AMDGPUFunctionArgInfo::fixedABILayout() {
+ AMDGPUFunctionArgInfo AI;
+ AI.PrivateSegmentBuffer
+ = ArgDescriptor::createRegister(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3);
+ AI.DispatchPtr = ArgDescriptor::createRegister(AMDGPU::SGPR4_SGPR5);
+ AI.QueuePtr = ArgDescriptor::createRegister(AMDGPU::SGPR6_SGPR7);
+
+ // Do not pass kernarg segment pointer, only pass increment version in its
+ // place.
+ AI.ImplicitArgPtr = ArgDescriptor::createRegister(AMDGPU::SGPR8_SGPR9);
+ AI.DispatchID = ArgDescriptor::createRegister(AMDGPU::SGPR10_SGPR11);
+
+ // Skip FlatScratchInit/PrivateSegmentSize
+ AI.WorkGroupIDX = ArgDescriptor::createRegister(AMDGPU::SGPR12);
+ AI.WorkGroupIDY = ArgDescriptor::createRegister(AMDGPU::SGPR13);
+ AI.WorkGroupIDZ = ArgDescriptor::createRegister(AMDGPU::SGPR14);
+
+ const unsigned Mask = 0x3ff;
+ AI.WorkItemIDX = ArgDescriptor::createRegister(AMDGPU::VGPR31, Mask);
+ AI.WorkItemIDY = ArgDescriptor::createRegister(AMDGPU::VGPR31, Mask << 10);
+ AI.WorkItemIDZ = ArgDescriptor::createRegister(AMDGPU::VGPR31, Mask << 20);
+ return AI;
+}
+
+const AMDGPUFunctionArgInfo &
+AMDGPUArgumentUsageInfo::lookupFuncArgInfo(const Function &F) const {
+ auto I = ArgInfoMap.find(&F);
+ if (I == ArgInfoMap.end()) {
+ if (AMDGPUTargetMachine::EnableFixedFunctionABI)
+ return FixedABIFunctionInfo;
+
+ // Without the fixed ABI, we assume no function has special inputs.
+ assert(F.isDeclaration());
+ return ExternFunctionInfo;
+ }
+
+ return I->second;
+}
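
One detail worth noting in fixedABILayout() above: all three work item IDs share a single VGPR (v31) as 10-bit fields, selected by the Mask, Mask << 10 and Mask << 20 register masks. A minimal sketch of the corresponding unpacking, for illustration only:

    #include <cassert>
    #include <cstdint>

    // Extract the work item ID for dimension 0/1/2 (x/y/z) from the packed
    // 32-bit value the fixed ABI passes in v31.
    static uint32_t unpackWorkItemID(uint32_t PackedV31, unsigned Dim) {
      assert(Dim < 3 && "only x, y and z are packed");
      return (PackedV31 >> (10 * Dim)) & 0x3ff; // Mask = 0x3ff per dimension
    }
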
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
index f0e7ee910f957..576e6cfe929e2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
@@ -11,15 +11,13 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/CodeGen/Register.h"
-#include "llvm/IR/Function.h"
#include "llvm/Pass.h"
+#include "llvm/Support/LowLevelTypeImpl.h"
namespace llvm {
class Function;
class raw_ostream;
-class GCNSubtarget;
-class TargetMachine;
class TargetRegisterClass;
class TargetRegisterInfo;
@@ -40,19 +38,22 @@ private:
bool IsSet : 1;
public:
- ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u,
+ constexpr ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u,
bool IsStack = false, bool IsSet = false)
: Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {}
- static ArgDescriptor createRegister(Register Reg, unsigned Mask = ~0u) {
+ static constexpr ArgDescriptor createRegister(Register Reg,
+ unsigned Mask = ~0u) {
return ArgDescriptor(Reg, Mask, false, true);
}
- static ArgDescriptor createStack(unsigned Offset, unsigned Mask = ~0u) {
+ static constexpr ArgDescriptor createStack(unsigned Offset,
+ unsigned Mask = ~0u) {
return ArgDescriptor(Offset, Mask, true, true);
}
- static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask) {
+ static constexpr ArgDescriptor createArg(const ArgDescriptor &Arg,
+ unsigned Mask) {
return ArgDescriptor(Arg.Reg, Mask, Arg.IsStack, Arg.IsSet);
}
@@ -141,25 +142,29 @@ struct AMDGPUFunctionArgInfo {
ArgDescriptor ImplicitArgPtr;
// Input registers for non-HSA ABI
- ArgDescriptor ImplicitBufferPtr = 0;
+ ArgDescriptor ImplicitBufferPtr;
// VGPRs inputs. These are always v0, v1 and v2 for entry functions.
ArgDescriptor WorkItemIDX;
ArgDescriptor WorkItemIDY;
ArgDescriptor WorkItemIDZ;
- std::pair<const ArgDescriptor *, const TargetRegisterClass *>
+ std::tuple<const ArgDescriptor *, const TargetRegisterClass *, LLT>
getPreloadedValue(PreloadedValue Value) const;
+
+ static constexpr AMDGPUFunctionArgInfo fixedABILayout();
};
class AMDGPUArgumentUsageInfo : public ImmutablePass {
private:
- static const AMDGPUFunctionArgInfo ExternFunctionInfo;
DenseMap<const Function *, AMDGPUFunctionArgInfo> ArgInfoMap;
public:
static char ID;
+ static const AMDGPUFunctionArgInfo ExternFunctionInfo;
+ static const AMDGPUFunctionArgInfo FixedABIFunctionInfo;
+
AMDGPUArgumentUsageInfo() : ImmutablePass(ID) { }
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -175,15 +180,7 @@ public:
ArgInfoMap[&F] = ArgInfo;
}
- const AMDGPUFunctionArgInfo &lookupFuncArgInfo(const Function &F) const {
- auto I = ArgInfoMap.find(&F);
- if (I == ArgInfoMap.end()) {
- assert(F.isDeclaration());
- return ExternFunctionInfo;
- }
-
- return I->second;
- }
+ const AMDGPUFunctionArgInfo &lookupFuncArgInfo(const Function &F) const;
};
} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 9e07b4d252b78..eef8fe2fc3b70 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -49,9 +49,25 @@ using namespace llvm;
using namespace llvm::AMDGPU;
using namespace llvm::AMDGPU::HSAMD;
-// TODO: This should get the default rounding mode from the kernel. We just set
-// the default here, but this could change if the OpenCL rounding mode pragmas
-// are used.
+// We need to tell the runtime some amount ahead of time if we don't know the
+// true stack size. Assume a smaller number if this is only due to dynamic /
+// non-entry block allocas.
+static cl::opt<uint32_t> AssumedStackSizeForExternalCall(
+ "amdgpu-assume-external-call-stack-size",
+ cl::desc("Assumed stack use of any external call (in bytes)"),
+ cl::Hidden,
+ cl::init(16384));
+
+static cl::opt<uint32_t> AssumedStackSizeForDynamicSizeObjects(
+ "amdgpu-assume-dynamic-stack-object-size",
+ cl::desc("Assumed extra stack use if there are any "
+ "variable sized objects (in bytes)"),
+ cl::Hidden,
+ cl::init(4096));
+
+// This should get the default rounding mode from the kernel. We just set the
+// default here, but this could change if the OpenCL rounding mode pragmas are
+// used.
//
// The denormal mode here should match what is reported by the OpenCL runtime
// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
@@ -70,18 +86,10 @@ using namespace llvm::AMDGPU::HSAMD;
// instructions to run at the double precision rate for the device so it's
// probably best to just report no single precision denormals.
static uint32_t getFPMode(AMDGPU::SIModeRegisterDefaults Mode) {
-
- // TODO: Is there any real use for the flush in only / flush out only modes?
- uint32_t FP32Denormals =
- Mode.FP32Denormals ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
-
- uint32_t FP64Denormals =
- Mode.FP64FP16Denormals ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
-
return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
- FP_DENORM_MODE_SP(FP32Denormals) |
- FP_DENORM_MODE_DP(FP64Denormals);
+ FP_DENORM_MODE_SP(Mode.fpDenormModeSPValue()) |
+ FP_DENORM_MODE_DP(Mode.fpDenormModeDPValue());
}
static AsmPrinter *
@@ -120,7 +128,7 @@ AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const {
return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer());
}
-void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
+void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) {
if (IsaInfo::hasCodeObjectV3(getGlobalSTI())) {
std::string ExpectedTarget;
raw_string_ostream ExpectedTargetOS(ExpectedTarget);
@@ -152,7 +160,7 @@ void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
Version.Major, Version.Minor, Version.Stepping, "AMD", "AMDGPU");
}
-void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) {
+void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) {
// Following code requires TargetStreamer to be present.
if (!getTargetStreamer())
return;
@@ -188,7 +196,7 @@ bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough(
return (MBB->back().getOpcode() != AMDGPU::S_SETPC_B64);
}
-void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
+void AMDGPUAsmPrinter::emitFunctionBodyStart() {
const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
if (!MFI.isEntryFunction())
return;
@@ -207,7 +215,7 @@ void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
}
-void AMDGPUAsmPrinter::EmitFunctionBodyEnd() {
+void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
if (!MFI.isEntryFunction())
return;
@@ -226,7 +234,7 @@ void AMDGPUAsmPrinter::EmitFunctionBodyEnd() {
// CP microcode requires the kernel descriptor to be allocated on 64 byte
// alignment.
- Streamer.EmitValueToAlignment(64, 0, 1, 0);
+ Streamer.emitValueToAlignment(64, 0, 1, 0);
if (ReadOnlySection.getAlignment() < 64)
ReadOnlySection.setAlignment(Align(64));
@@ -247,10 +255,10 @@ void AMDGPUAsmPrinter::EmitFunctionBodyEnd() {
Streamer.PopSection();
}
-void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
+void AMDGPUAsmPrinter::emitFunctionEntryLabel() {
if (IsaInfo::hasCodeObjectV3(getGlobalSTI()) &&
TM.getTargetTriple().getOS() == Triple::AMDHSA) {
- AsmPrinter::EmitFunctionEntryLabel();
+ AsmPrinter::emitFunctionEntryLabel();
return;
}
@@ -269,10 +277,10 @@ void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
HexLines.push_back("");
}
- AsmPrinter::EmitFunctionEntryLabel();
+ AsmPrinter::emitFunctionEntryLabel();
}
-void AMDGPUAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) {
+void AMDGPUAsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) {
if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) {
// Write a line for the basic block label if it is not only fallthrough.
DisasmLines.push_back(
@@ -281,10 +289,10 @@ void AMDGPUAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) {
DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
HexLines.push_back("");
}
- AsmPrinter::EmitBasicBlockStart(MBB);
+ AsmPrinter::emitBasicBlockStart(MBB);
}
-void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
+void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
if (GV->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) {
OutContext.reportError({},
@@ -307,18 +315,16 @@ void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
const DataLayout &DL = GV->getParent()->getDataLayout();
uint64_t Size = DL.getTypeAllocSize(GV->getValueType());
- unsigned Align = GV->getAlignment();
- if (!Align)
- Align = 4;
+ Align Alignment = GV->getAlign().getValueOr(Align(4));
- EmitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
- EmitLinkage(GV, GVSym);
+ emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
+ emitLinkage(GV, GVSym);
if (auto TS = getTargetStreamer())
- TS->emitAMDGPULDS(GVSym, Size, Align);
+ TS->emitAMDGPULDS(GVSym, Size, Alignment);
return;
}
- AsmPrinter::EmitGlobalVariable(GV);
+ AsmPrinter::emitGlobalVariable(GV);
}
bool AMDGPUAsmPrinter::doFinalization(Module &M) {
@@ -468,7 +474,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
HexLines.clear();
DisasmLineMaxLen = 0;
- EmitFunctionBody();
+ emitFunctionBody();
if (isVerbose()) {
MCSectionELF *CommentSection =
@@ -549,7 +555,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
if (DumpCodeInstEmitter) {
OutStreamer->SwitchSection(
- Context.getELFSection(".AMDGPU.disasm", ELF::SHT_NOTE, 0));
+ Context.getELFSection(".AMDGPU.disasm", ELF::SHT_PROGBITS, 0));
for (size_t i = 0; i < DisasmLines.size(); ++i) {
std::string Comment = "\n";
@@ -558,8 +564,8 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
Comment += " ; " + HexLines[i] + "\n";
}
- OutStreamer->EmitBytes(StringRef(DisasmLines[i]));
- OutStreamer->EmitBytes(StringRef(Comment));
+ OutStreamer->emitBytes(StringRef(DisasmLines[i]));
+ OutStreamer->emitBytes(StringRef(Comment));
}
}
@@ -609,6 +615,15 @@ int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumVGPRs(
return std::max(NumVGPR, NumAGPR);
}
+static const Function *getCalleeFunction(const MachineOperand &Op) {
+ if (Op.isImm()) {
+ assert(Op.getImm() == 0);
+ return nullptr;
+ }
+
+ return cast<Function>(Op.getGlobal());
+}
+
AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
const MachineFunction &MF) const {
SIFunctionResourceInfo Info;
@@ -636,11 +651,15 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
Info.UsesFlatScratch = false;
}
- Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
Info.PrivateSegmentSize = FrameInfo.getStackSize();
- if (MFI->isStackRealigned())
- Info.PrivateSegmentSize += FrameInfo.getMaxAlignment();
+ // Assume a big number if there are any unknown sized objects.
+ Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
+ if (Info.HasDynamicallySizedStack)
+ Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;
+
+ if (MFI->isStackRealigned())
+ Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();
Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) ||
MRI.isPhysRegUsed(AMDGPU::VCC_HI);
@@ -715,6 +734,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
case AMDGPU::SRC_PRIVATE_BASE:
case AMDGPU::SRC_PRIVATE_LIMIT:
case AMDGPU::SGPR_NULL:
+ case AMDGPU::MODE:
continue;
case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
@@ -727,6 +747,10 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
case AMDGPU::VCC:
case AMDGPU::VCC_LO:
case AMDGPU::VCC_HI:
+ case AMDGPU::VCC_LO_LO16:
+ case AMDGPU::VCC_LO_HI16:
+ case AMDGPU::VCC_HI_LO16:
+ case AMDGPU::VCC_HI_HI16:
Info.UsesVCC = true;
continue;
@@ -764,15 +788,20 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
break;
}
- if (AMDGPU::SReg_32RegClass.contains(Reg)) {
+ if (AMDGPU::SReg_32RegClass.contains(Reg) ||
+ AMDGPU::SReg_LO16RegClass.contains(Reg) ||
+ AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
assert(!AMDGPU::TTMP_32RegClass.contains(Reg) &&
"trap handler registers should not be used");
IsSGPR = true;
Width = 1;
- } else if (AMDGPU::VGPR_32RegClass.contains(Reg)) {
+ } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
+ AMDGPU::VGPR_LO16RegClass.contains(Reg) ||
+ AMDGPU::VGPR_HI16RegClass.contains(Reg)) {
IsSGPR = false;
Width = 1;
- } else if (AMDGPU::AGPR_32RegClass.contains(Reg)) {
+ } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
+ AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 1;
@@ -794,6 +823,10 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
} else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
IsSGPR = true;
Width = 3;
+ } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 3;
} else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
assert(!AMDGPU::TTMP_128RegClass.contains(Reg) &&
"trap handler registers should not be used");
@@ -812,6 +845,20 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
} else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
IsSGPR = true;
Width = 5;
+ } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 5;
+ } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 6;
+ } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
+ IsSGPR = true;
+ Width = 6;
+ } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 6;
} else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
"trap handler registers should not be used");
@@ -820,6 +867,10 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
} else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
IsSGPR = false;
Width = 8;
+ } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 8;
} else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
assert(!AMDGPU::TTMP_512RegClass.contains(Reg) &&
"trap handler registers should not be used");
@@ -862,8 +913,9 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
const MachineOperand *CalleeOp
= TII->getNamedOperand(MI, AMDGPU::OpName::callee);
- const Function *Callee = cast<Function>(CalleeOp->getGlobal());
- if (Callee->isDeclaration()) {
+
+ const Function *Callee = getCalleeFunction(*CalleeOp);
+ if (!Callee || Callee->isDeclaration()) {
// If this is a call to an external function, we can't do much. Make
// conservative guesses.
@@ -874,7 +926,9 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
MaxVGPR = std::max(MaxVGPR, 23);
MaxAGPR = std::max(MaxAGPR, 23);
- CalleeFrameSize = std::max(CalleeFrameSize, UINT64_C(16384));
+ CalleeFrameSize = std::max(CalleeFrameSize,
+ static_cast<uint64_t>(AssumedStackSizeForExternalCall));
+
Info.UsesVCC = true;
Info.UsesFlatScratch = ST.hasFlatAddressSpace();
Info.HasDynamicallySizedStack = true;
@@ -906,7 +960,8 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
Info.HasRecursion |= I->second.HasRecursion;
}
- if (!Callee->doesNotRecurse())
+ // FIXME: Call site could have norecurse on it
+ if (!Callee || !Callee->doesNotRecurse())
Info.HasRecursion = true;
}
}
@@ -1108,7 +1163,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
S_00B84C_LDS_SIZE(STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks) |
S_00B84C_EXCP_EN(0);
- ProgInfo.Occupancy = STM.computeOccupancy(MF, ProgInfo.LDSSize,
+ ProgInfo.Occupancy = STM.computeOccupancy(MF.getFunction(), ProgInfo.LDSSize,
ProgInfo.NumSGPRsForWavesPerEU,
ProgInfo.NumVGPRsForWavesPerEU);
}
@@ -1132,40 +1187,41 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
- OutStreamer->EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4);
+ OutStreamer->emitInt32(R_00B848_COMPUTE_PGM_RSRC1);
- OutStreamer->EmitIntValue(CurrentProgramInfo.ComputePGMRSrc1, 4);
+ OutStreamer->emitInt32(CurrentProgramInfo.ComputePGMRSrc1);
- OutStreamer->EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
- OutStreamer->EmitIntValue(CurrentProgramInfo.ComputePGMRSrc2, 4);
+ OutStreamer->emitInt32(R_00B84C_COMPUTE_PGM_RSRC2);
+ OutStreamer->emitInt32(CurrentProgramInfo.ComputePGMRSrc2);
- OutStreamer->EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4);
- OutStreamer->EmitIntValue(S_00B860_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);
+ OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE);
+ OutStreamer->emitInt32(S_00B860_WAVESIZE(CurrentProgramInfo.ScratchBlocks));
// TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
// 0" comment but I don't see a corresponding field in the register spec.
} else {
- OutStreamer->EmitIntValue(RsrcReg, 4);
- OutStreamer->EmitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
+ OutStreamer->emitInt32(RsrcReg);
+ OutStreamer->emitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
- OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);
- OutStreamer->EmitIntValue(
+ OutStreamer->emitInt32(R_0286E8_SPI_TMPRING_SIZE);
+ OutStreamer->emitIntValue(
S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);
}
if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
- OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
- OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks), 4);
- OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
- OutStreamer->EmitIntValue(MFI->getPSInputEnable(), 4);
- OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4);
- OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4);
+ OutStreamer->emitInt32(R_00B02C_SPI_SHADER_PGM_RSRC2_PS);
+ OutStreamer->emitInt32(
+ S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks));
+ OutStreamer->emitInt32(R_0286CC_SPI_PS_INPUT_ENA);
+ OutStreamer->emitInt32(MFI->getPSInputEnable());
+ OutStreamer->emitInt32(R_0286D0_SPI_PS_INPUT_ADDR);
+ OutStreamer->emitInt32(MFI->getPSInputAddr());
}
- OutStreamer->EmitIntValue(R_SPILLED_SGPRS, 4);
- OutStreamer->EmitIntValue(MFI->getNumSpilledSGPRs(), 4);
- OutStreamer->EmitIntValue(R_SPILLED_VGPRS, 4);
- OutStreamer->EmitIntValue(MFI->getNumSpilledVGPRs(), 4);
+ OutStreamer->emitInt32(R_SPILLED_SGPRS);
+ OutStreamer->emitInt32(MFI->getNumSpilledSGPRs());
+ OutStreamer->emitInt32(R_SPILLED_VGPRS);
+ OutStreamer->emitInt32(MFI->getNumSpilledVGPRs());
}
// This is the equivalent of EmitProgramInfoSI above, but for when the OS type
@@ -1304,7 +1360,18 @@ bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
AMDGPUInstPrinter::printRegOperand(MO.getReg(), O,
*MF->getSubtarget().getRegisterInfo());
return false;
+ } else if (MO.isImm()) {
+ int64_t Val = MO.getImm();
+ if (AMDGPU::isInlinableIntLiteral(Val)) {
+ O << Val;
+ } else if (isUInt<16>(Val)) {
+ O << format("0x%" PRIx16, static_cast<uint16_t>(Val));
+ } else if (isUInt<32>(Val)) {
+ O << format("0x%" PRIx32, static_cast<uint32_t>(Val));
+ } else {
+ O << format("0x%" PRIx64, static_cast<uint64_t>(Val));
+ }
+ return false;
}
-
return true;
}
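A standalone sketch of the immediate formatting added to PrintAsmOperand above; the -16..64 window used for the decimal case is an assumed stand-in for isInlinableIntLiteral, and the width selection mirrors the isUInt<16>/isUInt<32> checks.

    // Host-side model, not the in-tree code.
    #include <cinttypes>
    #include <cstdint>
    #include <cstdio>

    static void printImm(int64_t Val) {
      if (Val >= -16 && Val <= 64)                    // assumed inlinable range
        std::printf("%" PRId64 "\n", Val);
      else if (Val >= 0 && Val <= UINT16_MAX)
        std::printf("0x%" PRIx16 "\n", (uint16_t)Val);
      else if (Val >= 0 && Val <= UINT32_MAX)
        std::printf("0x%" PRIx32 "\n", (uint32_t)Val);
      else
        std::printf("0x%" PRIx64 "\n", (uint64_t)Val);
    }

    int main() {
      printImm(32);       // 32
      printImm(1000);     // 0x3e8
      printImm(1 << 20);  // 0x100000
      printImm(-200);     // 0xffffffffffffff38
    }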
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index c50c19a4609c6..54e8338ab4b04 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -121,21 +121,21 @@ public:
const MachineInstr *MI);
/// Implemented in AMDGPUMCInstLower.cpp
- void EmitInstruction(const MachineInstr *MI) override;
+ void emitInstruction(const MachineInstr *MI) override;
- void EmitFunctionBodyStart() override;
+ void emitFunctionBodyStart() override;
- void EmitFunctionBodyEnd() override;
+ void emitFunctionBodyEnd() override;
- void EmitFunctionEntryLabel() override;
+ void emitFunctionEntryLabel() override;
- void EmitBasicBlockStart(const MachineBasicBlock &MBB) override;
+ void emitBasicBlockStart(const MachineBasicBlock &MBB) override;
- void EmitGlobalVariable(const GlobalVariable *GV) override;
+ void emitGlobalVariable(const GlobalVariable *GV) override;
- void EmitStartOfAsmFile(Module &M) override;
+ void emitStartOfAsmFile(Module &M) override;
- void EmitEndOfAsmFile(Module &M) override;
+ void emitEndOfAsmFile(Module &M) override;
bool isBlockOnlyReachableByFallthrough(
const MachineBasicBlock *MBB) const override;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index 59aa0ea98aa79..c9d25d4250d55 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -438,7 +438,7 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
Type *const Ty = I.getType();
const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
- Type *const VecTy = VectorType::get(B.getInt32Ty(), 2);
+ auto *const VecTy = FixedVectorType::get(B.getInt32Ty(), 2);
// This is the value in the atomic operation we need to combine in order to
// reduce the number of atomic operations.
@@ -447,9 +447,8 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
// We need to know how many lanes are active within the wavefront, and we do
// this by doing a ballot of active lanes.
Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
- CallInst *const Ballot = B.CreateIntrinsic(
- Intrinsic::amdgcn_icmp, {WaveTy, B.getInt32Ty()},
- {B.getInt32(1), B.getInt32(0), B.getInt32(CmpInst::ICMP_NE)});
+ CallInst *const Ballot =
+ B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());
// We need to know how many lanes are active within the wavefront that are
// below us. If we counted each lane linearly starting from 0, a lane is
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index c657ca71bfdf4..05a4e3462a263 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -16,6 +16,7 @@
#include "AMDGPU.h"
#include "AMDGPUISelLowering.h"
#include "AMDGPUSubtarget.h"
+#include "AMDGPUTargetMachine.h"
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
@@ -59,6 +60,18 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler {
} else
ExtReg = extendRegister(ValVReg, VA);
+ // If this is a scalar return, insert a readfirstlane just in case the value
+ // ends up in a VGPR.
+ // FIXME: Assert this is a shader return.
+ const SIRegisterInfo *TRI
+ = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
+ if (TRI->isSGPRReg(MRI, PhysReg)) {
+ auto ToSGPR = MIRBuilder.buildIntrinsic(Intrinsic::amdgcn_readfirstlane,
+ {MRI.getType(ExtReg)}, false)
+ .addReg(ExtReg);
+ ExtReg = ToSGPR.getReg(0);
+ }
+
MIRBuilder.buildCopy(PhysReg, ExtReg);
MIB.addUse(PhysReg, RegState::Implicit);
}
@@ -84,11 +97,10 @@ struct IncomingArgHandler : public CallLowering::ValueHandler {
auto &MFI = MIRBuilder.getMF().getFrameInfo();
int FI = MFI.CreateFixedObject(Size, Offset, true);
MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
- Register AddrReg = MRI.createGenericVirtualRegister(
- LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32));
- MIRBuilder.buildFrameIndex(AddrReg, FI);
+ auto AddrReg = MIRBuilder.buildFrameIndex(
+ LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32), FI);
StackUsed = std::max(StackUsed, Size + Offset);
- return AddrReg;
+ return AddrReg.getReg(0);
}
void assignValueToReg(Register ValVReg, Register PhysReg,
@@ -119,9 +131,12 @@ struct IncomingArgHandler : public CallLowering::ValueHandler {
void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
MachinePointerInfo &MPO, CCValAssign &VA) override {
+ MachineFunction &MF = MIRBuilder.getMF();
+
// FIXME: Get alignment
- auto MMO = MIRBuilder.getMF().getMachineMemOperand(
- MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size, 1);
+ auto MMO = MF.getMachineMemOperand(
+ MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size,
+ inferAlignFromPtrInfo(MF, MPO));
MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
}
@@ -150,10 +165,26 @@ AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
: CallLowering(&TLI) {
}
+// FIXME: Compatibility shim
+static ISD::NodeType extOpcodeToISDExtOpcode(unsigned MIOpc) {
+ switch (MIOpc) {
+ case TargetOpcode::G_SEXT:
+ return ISD::SIGN_EXTEND;
+ case TargetOpcode::G_ZEXT:
+ return ISD::ZERO_EXTEND;
+ case TargetOpcode::G_ANYEXT:
+ return ISD::ANY_EXTEND;
+ default:
+ llvm_unreachable("not an extend opcode");
+ }
+}
+
void AMDGPUCallLowering::splitToValueTypes(
- const ArgInfo &OrigArg, SmallVectorImpl<ArgInfo> &SplitArgs,
- const DataLayout &DL, MachineRegisterInfo &MRI, CallingConv::ID CallConv,
- SplitArgTy PerformArgSplit) const {
+ MachineIRBuilder &B,
+ const ArgInfo &OrigArg, unsigned OrigArgIdx,
+ SmallVectorImpl<ArgInfo> &SplitArgs,
+ const DataLayout &DL, CallingConv::ID CallConv,
+ SplitArgTy PerformArgSplit) const {
const SITargetLowering &TLI = *getTLI<SITargetLowering>();
LLVMContext &Ctx = OrigArg.Ty->getContext();
@@ -167,28 +198,46 @@ void AMDGPUCallLowering::splitToValueTypes(
int SplitIdx = 0;
for (EVT VT : SplitVTs) {
- unsigned NumParts = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT);
+ Register Reg = OrigArg.Regs[SplitIdx];
Type *Ty = VT.getTypeForEVT(Ctx);
+ LLT LLTy = getLLTForType(*Ty, DL);
+ if (OrigArgIdx == AttributeList::ReturnIndex && VT.isScalarInteger()) {
+ unsigned ExtendOp = TargetOpcode::G_ANYEXT;
+ if (OrigArg.Flags[0].isSExt()) {
+ assert(OrigArg.Regs.size() == 1 && "expect only simple return values");
+ ExtendOp = TargetOpcode::G_SEXT;
+ } else if (OrigArg.Flags[0].isZExt()) {
+ assert(OrigArg.Regs.size() == 1 && "expect only simple return values");
+ ExtendOp = TargetOpcode::G_ZEXT;
+ }
+ EVT ExtVT = TLI.getTypeForExtReturn(Ctx, VT,
+ extOpcodeToISDExtOpcode(ExtendOp));
+ if (ExtVT != VT) {
+ VT = ExtVT;
+ Ty = ExtVT.getTypeForEVT(Ctx);
+ LLTy = getLLTForType(*Ty, DL);
+ Reg = B.buildInstr(ExtendOp, {LLTy}, {Reg}).getReg(0);
+ }
+ }
+
+ unsigned NumParts = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT);
+ MVT RegVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT);
if (NumParts == 1) {
// No splitting to do, but we want to replace the original type (e.g. [1 x
// double] -> double).
- SplitArgs.emplace_back(OrigArg.Regs[SplitIdx], Ty,
- OrigArg.Flags, OrigArg.IsFixed);
+ SplitArgs.emplace_back(Reg, Ty, OrigArg.Flags, OrigArg.IsFixed);
++SplitIdx;
continue;
}
- LLT LLTy = getLLTForType(*Ty, DL);
-
SmallVector<Register, 8> SplitRegs;
-
- EVT PartVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT);
- Type *PartTy = PartVT.getTypeForEVT(Ctx);
+ Type *PartTy = EVT(RegVT).getTypeForEVT(Ctx);
LLT PartLLT = getLLTForType(*PartTy, DL);
+ MachineRegisterInfo &MRI = *B.getMRI();
// FIXME: Should we be reporting all of the part registers for a single
// argument, and let handleAssignments take care of the repacking?
@@ -198,7 +247,7 @@ void AMDGPUCallLowering::splitToValueTypes(
SplitArgs.emplace_back(ArrayRef<Register>(PartReg), PartTy, OrigArg.Flags);
}
- PerformArgSplit(SplitRegs, LLTy, PartLLT, SplitIdx);
+ PerformArgSplit(SplitRegs, Reg, LLTy, PartLLT, SplitIdx);
++SplitIdx;
}
@@ -218,13 +267,11 @@ static LLT getMultipleType(LLT OrigTy, int Factor) {
static void unpackRegsToOrigType(MachineIRBuilder &B,
ArrayRef<Register> DstRegs,
Register SrcReg,
+ const CallLowering::ArgInfo &Info,
LLT SrcTy,
LLT PartTy) {
assert(DstRegs.size() > 1 && "Nothing to unpack");
- MachineFunction &MF = B.getMF();
- MachineRegisterInfo &MRI = MF.getRegInfo();
-
const unsigned SrcSize = SrcTy.getSizeInBits();
const unsigned PartSize = PartTy.getSizeInBits();
@@ -248,12 +295,11 @@ static void unpackRegsToOrigType(MachineIRBuilder &B,
LLT BigTy = getMultipleType(PartTy, NumRoundedParts);
auto ImpDef = B.buildUndef(BigTy);
- Register BigReg = MRI.createGenericVirtualRegister(BigTy);
- B.buildInsert(BigReg, ImpDef.getReg(0), SrcReg, 0).getReg(0);
+ auto Big = B.buildInsert(BigTy, ImpDef.getReg(0), SrcReg, 0).getReg(0);
int64_t Offset = 0;
for (unsigned i = 0, e = DstRegs.size(); i != e; ++i, Offset += PartSize)
- B.buildExtract(DstRegs[i], BigReg, Offset);
+ B.buildExtract(DstRegs[i], Big, Offset);
}
/// Lower the return value for the already existing \p Ret. This assumes that
@@ -267,24 +313,26 @@ bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B,
auto &MF = B.getMF();
const auto &F = MF.getFunction();
const DataLayout &DL = MF.getDataLayout();
+ MachineRegisterInfo *MRI = B.getMRI();
CallingConv::ID CC = F.getCallingConv();
const SITargetLowering &TLI = *getTLI<SITargetLowering>();
- MachineRegisterInfo &MRI = MF.getRegInfo();
ArgInfo OrigRetInfo(VRegs, Val->getType());
setArgFlags(OrigRetInfo, AttributeList::ReturnIndex, DL, F);
SmallVector<ArgInfo, 4> SplitRetInfos;
splitToValueTypes(
- OrigRetInfo, SplitRetInfos, DL, MRI, CC,
- [&](ArrayRef<Register> Regs, LLT LLTy, LLT PartLLT, int VTSplitIdx) {
- unpackRegsToOrigType(B, Regs, VRegs[VTSplitIdx], LLTy, PartLLT);
+ B, OrigRetInfo, AttributeList::ReturnIndex, SplitRetInfos, DL, CC,
+ [&](ArrayRef<Register> Regs, Register SrcReg, LLT LLTy, LLT PartLLT,
+ int VTSplitIdx) {
+ unpackRegsToOrigType(B, Regs, SrcReg,
+ SplitRetInfos[VTSplitIdx],
+ LLTy, PartLLT);
});
CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(CC, F.isVarArg());
-
- OutgoingValueHandler RetHandler(B, MF.getRegInfo(), Ret, AssignFn);
+ OutgoingValueHandler RetHandler(B, *MRI, Ret, AssignFn);
return handleAssignments(B, SplitRetInfos, RetHandler);
}
@@ -309,7 +357,7 @@ bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B,
return true;
}
- auto const &ST = B.getMF().getSubtarget<GCNSubtarget>();
+ auto const &ST = MF.getSubtarget<GCNSubtarget>();
unsigned ReturnOpc =
IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::S_SETPC_B64_return;
@@ -348,22 +396,17 @@ Register AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &B,
const DataLayout &DL = F.getParent()->getDataLayout();
PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS);
LLT PtrType = getLLTForType(*PtrTy, DL);
- Register DstReg = MRI.createGenericVirtualRegister(PtrType);
Register KernArgSegmentPtr =
MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
Register KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);
- Register OffsetReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
- B.buildConstant(OffsetReg, Offset);
+ auto OffsetReg = B.buildConstant(LLT::scalar(64), Offset);
- B.buildPtrAdd(DstReg, KernArgSegmentVReg, OffsetReg);
-
- return DstReg;
+ return B.buildPtrAdd(PtrType, KernArgSegmentVReg, OffsetReg).getReg(0);
}
-void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B,
- Type *ParamTy, uint64_t Offset,
- unsigned Align,
+void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, Type *ParamTy,
+ uint64_t Offset, Align Alignment,
Register DstReg) const {
MachineFunction &MF = B.getMF();
const Function &F = MF.getFunction();
@@ -372,11 +415,11 @@ void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B,
unsigned TypeSize = DL.getTypeStoreSize(ParamTy);
Register PtrReg = lowerParameterPtr(B, ParamTy, Offset);
- MachineMemOperand *MMO =
- MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad |
- MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOInvariant,
- TypeSize, Align);
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ PtrInfo,
+ MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant,
+ TypeSize, Alignment);
B.buildLoad(DstReg, PtrReg, *MMO);
}
@@ -389,19 +432,19 @@ static void allocateHSAUserSGPRs(CCState &CCInfo,
SIMachineFunctionInfo &Info) {
// FIXME: How should these inputs interact with inreg / custom SGPR inputs?
if (Info.hasPrivateSegmentBuffer()) {
- unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
+ Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
CCInfo.AllocateReg(PrivateSegmentBufferReg);
}
if (Info.hasDispatchPtr()) {
- unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
+ Register DispatchPtrReg = Info.addDispatchPtr(TRI);
MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(DispatchPtrReg);
}
if (Info.hasQueuePtr()) {
- unsigned QueuePtrReg = Info.addQueuePtr(TRI);
+ Register QueuePtrReg = Info.addQueuePtr(TRI);
MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(QueuePtrReg);
}
@@ -418,13 +461,13 @@ static void allocateHSAUserSGPRs(CCState &CCInfo,
}
if (Info.hasDispatchID()) {
- unsigned DispatchIDReg = Info.addDispatchID(TRI);
+ Register DispatchIDReg = Info.addDispatchID(TRI);
MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(DispatchIDReg);
}
if (Info.hasFlatScratchInit()) {
- unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
+ Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(FlatScratchInitReg);
}
@@ -451,7 +494,7 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
allocateHSAUserSGPRs(CCInfo, B, MF, *TRI, *Info);
unsigned i = 0;
- const unsigned KernArgBaseAlign = 16;
+ const Align KernArgBaseAlign(16);
const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F);
uint64_t ExplicitArgOffset = 0;
@@ -462,19 +505,24 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
if (AllocSize == 0)
continue;
- unsigned ABIAlign = DL.getABITypeAlignment(ArgTy);
+ Align ABIAlign = DL.getABITypeAlign(ArgTy);
uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;
+ if (Arg.use_empty()) {
+ ++i;
+ continue;
+ }
+
ArrayRef<Register> OrigArgRegs = VRegs[i];
Register ArgReg =
OrigArgRegs.size() == 1
? OrigArgRegs[0]
: MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL));
- unsigned Align = MinAlign(KernArgBaseAlign, ArgOffset);
- ArgOffset = alignTo(ArgOffset, DL.getABITypeAlignment(ArgTy));
- lowerParameter(B, ArgTy, ArgOffset, Align, ArgReg);
+
+ Align Alignment = commonAlignment(KernArgBaseAlign, ArgOffset);
+ lowerParameter(B, ArgTy, ArgOffset, Alignment, ArgReg);
if (OrigArgRegs.size() > 1)
unpackRegs(OrigArgRegs, ArgReg, ArgTy, B);
++i;
@@ -485,38 +533,72 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
return true;
}
+/// Pack values \p SrcRegs to cover the vector type result \p DstRegs.
+static MachineInstrBuilder mergeVectorRegsToResultRegs(
+ MachineIRBuilder &B, ArrayRef<Register> DstRegs, ArrayRef<Register> SrcRegs) {
+ MachineRegisterInfo &MRI = *B.getMRI();
+ LLT LLTy = MRI.getType(DstRegs[0]);
+ LLT PartLLT = MRI.getType(SrcRegs[0]);
+
+ // Deal with v3s16 split into v2s16
+ LLT LCMTy = getLCMType(LLTy, PartLLT);
+ if (LCMTy == LLTy) {
+ // Common case where no padding is needed.
+ assert(DstRegs.size() == 1);
+ return B.buildConcatVectors(DstRegs[0], SrcRegs);
+ }
+
+ const int NumWide = LCMTy.getSizeInBits() / PartLLT.getSizeInBits();
+ Register Undef = B.buildUndef(PartLLT).getReg(0);
+
+ // Build vector of undefs.
+ SmallVector<Register, 8> WidenedSrcs(NumWide, Undef);
+
+ // Replace the first sources with the real registers.
+ std::copy(SrcRegs.begin(), SrcRegs.end(), WidenedSrcs.begin());
+
+ auto Widened = B.buildConcatVectors(LCMTy, WidenedSrcs);
+ int NumDst = LCMTy.getSizeInBits() / LLTy.getSizeInBits();
+
+ SmallVector<Register, 8> PadDstRegs(NumDst);
+ std::copy(DstRegs.begin(), DstRegs.end(), PadDstRegs.begin());
+
+ // Create the excess dead defs for the unmerge.
+ for (int I = DstRegs.size(); I != NumDst; ++I)
+ PadDstRegs[I] = MRI.createGenericVirtualRegister(LLTy);
+
+ return B.buildUnmerge(PadDstRegs, Widened);
+}
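For the v3s16-from-v2s16 case the comment mentions, the padding math above works out as in this standalone sketch (illustrative arithmetic only, not the in-tree code): the LCM type is v6s16, so the two real v2s16 sources are padded with one undef for the concat, and the unmerge produces two v3s16 results of which the second is a dead def.

    #include <cstdio>
    #include <numeric>

    int main() {
      unsigned DstBits  = 3 * 16;                      // v3s16 result type
      unsigned PartBits = 2 * 16;                      // v2s16 assigned parts
      unsigned LCMBits  = std::lcm(DstBits, PartBits); // 96 bits == v6s16

      unsigned NumWide = LCMBits / PartBits;           // 3 concat sources
      unsigned NumDst  = LCMBits / DstBits;            // 2 unmerge results
      std::printf("concat sources: %u (1 undef), unmerge results: %u (1 dead)\n",
                  NumWide, NumDst);
    }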
+
// TODO: Move this to generic code
static void packSplitRegsToOrigType(MachineIRBuilder &B,
ArrayRef<Register> OrigRegs,
ArrayRef<Register> Regs,
LLT LLTy,
LLT PartLLT) {
- if (!LLTy.isVector() && !PartLLT.isVector()) {
- B.buildMerge(OrigRegs[0], Regs);
- return;
- }
+ MachineRegisterInfo &MRI = *B.getMRI();
- if (LLTy.isVector() && PartLLT.isVector()) {
- assert(LLTy.getElementType() == PartLLT.getElementType());
+ if (!LLTy.isVector() && !PartLLT.isVector()) {
+ assert(OrigRegs.size() == 1);
+ LLT OrigTy = MRI.getType(OrigRegs[0]);
- int DstElts = LLTy.getNumElements();
- int PartElts = PartLLT.getNumElements();
- if (DstElts % PartElts == 0)
- B.buildConcatVectors(OrigRegs[0], Regs);
+ unsigned SrcSize = PartLLT.getSizeInBits() * Regs.size();
+ if (SrcSize == OrigTy.getSizeInBits())
+ B.buildMerge(OrigRegs[0], Regs);
else {
- // Deal with v3s16 split into v2s16
- assert(PartElts == 2 && DstElts % 2 != 0);
- int RoundedElts = PartElts * ((DstElts + PartElts - 1) / PartElts);
-
- LLT RoundedDestTy = LLT::vector(RoundedElts, PartLLT.getElementType());
- auto RoundedConcat = B.buildConcatVectors(RoundedDestTy, Regs);
- B.buildExtract(OrigRegs[0], RoundedConcat, 0);
+ auto Widened = B.buildMerge(LLT::scalar(SrcSize), Regs);
+ B.buildTrunc(OrigRegs[0], Widened);
}
return;
}
- MachineRegisterInfo &MRI = *B.getMRI();
+ if (LLTy.isVector() && PartLLT.isVector()) {
+ assert(OrigRegs.size() == 1);
+ assert(LLTy.getElementType() == PartLLT.getElementType());
+ mergeVectorRegsToResultRegs(B, OrigRegs, Regs);
+ return;
+ }
assert(LLTy.isVector() && !PartLLT.isVector());
@@ -644,13 +726,16 @@ bool AMDGPUCallLowering::lowerFormalArguments(
}
ArgInfo OrigArg(VRegs[Idx], Arg.getType());
- setArgFlags(OrigArg, Idx + AttributeList::FirstArgIndex, DL, F);
+ const unsigned OrigArgIdx = Idx + AttributeList::FirstArgIndex;
+ setArgFlags(OrigArg, OrigArgIdx, DL, F);
splitToValueTypes(
- OrigArg, SplitArgs, DL, MRI, CC,
+ B, OrigArg, OrigArgIdx, SplitArgs, DL, CC,
// FIXME: We should probably be passing multiple registers to
// handleAssignments to do this
- [&](ArrayRef<Register> Regs, LLT LLTy, LLT PartLLT, int VTSplitIdx) {
+ [&](ArrayRef<Register> Regs, Register DstReg,
+ LLT LLTy, LLT PartLLT, int VTSplitIdx) {
+ assert(DstReg == VRegs[Idx][VTSplitIdx]);
packSplitRegsToOrigType(B, VRegs[Idx][VTSplitIdx], Regs,
LLTy, PartLLT);
});
@@ -705,11 +790,17 @@ bool AMDGPUCallLowering::lowerFormalArguments(
if (!MBB.empty())
B.setInstr(*MBB.begin());
+ if (!IsEntryFunc) {
+ // For the fixed ABI, pass workitem IDs in the last argument register.
+ if (AMDGPUTargetMachine::EnableFixedFunctionABI)
+ TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
+ }
+
FormalArgHandler Handler(B, MRI, AssignFn);
if (!handleAssignments(CCInfo, ArgLocs, B, SplitArgs, Handler))
return false;
- if (!IsEntryFunc) {
+ if (!IsEntryFunc && !AMDGPUTargetMachine::EnableFixedFunctionABI) {
// Special inputs come after user arguments.
TLI.allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
}
@@ -719,8 +810,6 @@ bool AMDGPUCallLowering::lowerFormalArguments(
TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsShader);
} else {
CCInfo.AllocateReg(Info->getScratchRSrcReg());
- CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
- CCInfo.AllocateReg(Info->getFrameOffsetReg());
TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
index 53a562586bc06..446619d1502ee 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -27,14 +27,16 @@ class AMDGPUCallLowering: public CallLowering {
uint64_t Offset) const;
void lowerParameter(MachineIRBuilder &B, Type *ParamTy, uint64_t Offset,
- unsigned Align, Register DstReg) const;
+ Align Alignment, Register DstReg) const;
/// A function of this type is used to perform value split action.
- using SplitArgTy = std::function<void(ArrayRef<Register>, LLT, LLT, int)>;
+ using SplitArgTy = std::function<void(ArrayRef<Register>, Register, LLT, LLT, int)>;
- void splitToValueTypes(const ArgInfo &OrigArgInfo,
+ void splitToValueTypes(MachineIRBuilder &B,
+ const ArgInfo &OrigArgInfo,
+ unsigned OrigArgIdx,
SmallVectorImpl<ArgInfo> &SplitArgs,
- const DataLayout &DL, MachineRegisterInfo &MRI,
+ const DataLayout &DL,
CallingConv::ID CallConv,
SplitArgTy SplitArg) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index f8a54a61aac22..7c83b6dcb44b9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -18,7 +18,7 @@ class CCIfExtend<CCAction A>
// Calling convention for SI
def CC_SI : CallingConv<[
- CCIfInReg<CCIfType<[f32, i32, f16, v2i16, v2f16] , CCAssignToReg<[
+ CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
@@ -28,7 +28,7 @@ def CC_SI : CallingConv<[
]>>>,
// 32*4 + 4 is the minimum for a fetch shader consumer with 32 inputs.
- CCIfNotInReg<CCIfType<[f32, i32, f16, v2i16, v2f16] , CCAssignToReg<[
+ CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
@@ -50,7 +50,7 @@ def CC_SI : CallingConv<[
]>;
def RetCC_SI_Shader : CallingConv<[
- CCIfType<[i32] , CCAssignToReg<[
+ CCIfType<[i32, i16] , CCAssignToReg<[
SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
@@ -89,6 +89,24 @@ def CSR_AMDGPU_VGPRs_32_255 : CalleeSavedRegs<
(sequence "VGPR%u", 32, 255)
>;
+def CSR_AMDGPU_VGPRs : CalleeSavedRegs<
+ // The CSRs & scratch-registers are interleaved at a split boundary of 8.
+ (add (sequence "VGPR%u", 40, 47),
+ (sequence "VGPR%u", 56, 63),
+ (sequence "VGPR%u", 72, 79),
+ (sequence "VGPR%u", 88, 95),
+ (sequence "VGPR%u", 104, 111),
+ (sequence "VGPR%u", 120, 127),
+ (sequence "VGPR%u", 136, 143),
+ (sequence "VGPR%u", 152, 159),
+ (sequence "VGPR%u", 168, 175),
+ (sequence "VGPR%u", 184, 191),
+ (sequence "VGPR%u", 200, 207),
+ (sequence "VGPR%u", 216, 223),
+ (sequence "VGPR%u", 232, 239),
+ (sequence "VGPR%u", 248, 255))
+>;
+
def CSR_AMDGPU_SGPRs_32_105 : CalleeSavedRegs<
(sequence "SGPR%u", 32, 105)
>;
@@ -104,7 +122,7 @@ def CSR_AMDGPU_AllAllocatableSRegs : CalleeSavedRegs<
>;
def CSR_AMDGPU_HighRegs : CalleeSavedRegs<
- (add CSR_AMDGPU_VGPRs_32_255, CSR_AMDGPU_SGPRs_32_105)
+ (add CSR_AMDGPU_VGPRs, CSR_AMDGPU_SGPRs_32_105)
>;
// Calling convention for leaf functions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index cf908766caa0d..a795493017402 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -15,8 +15,10 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
+#include "llvm/ADT/FloatingPointMode.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -26,6 +28,7 @@
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
@@ -41,6 +44,7 @@
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Transforms/Utils/IntegerDivision.h"
#include <cassert>
#include <iterator>
@@ -54,7 +58,7 @@ static cl::opt<bool> WidenLoads(
"amdgpu-codegenprepare-widen-constant-loads",
cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
cl::ReallyHidden,
- cl::init(true));
+ cl::init(false));
static cl::opt<bool> UseMul24Intrin(
"amdgpu-codegenprepare-mul24",
@@ -62,10 +66,26 @@ static cl::opt<bool> UseMul24Intrin(
cl::ReallyHidden,
cl::init(true));
+// Legalize 64-bit division by using the generic IR expansion.
+static cl::opt<bool> ExpandDiv64InIR(
+ "amdgpu-codegenprepare-expand-div64",
+ cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"),
+ cl::ReallyHidden,
+ cl::init(false));
+
+// Leave all division operations as they are. This supersedes ExpandDiv64InIR
+// and is used for testing the legalizer.
+static cl::opt<bool> DisableIDivExpand(
+ "amdgpu-codegenprepare-disable-idiv-expansion",
+ cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"),
+ cl::ReallyHidden,
+ cl::init(false));
+
class AMDGPUCodeGenPrepare : public FunctionPass,
public InstVisitor<AMDGPUCodeGenPrepare, bool> {
const GCNSubtarget *ST = nullptr;
AssumptionCache *AC = nullptr;
+ DominatorTree *DT = nullptr;
LegacyDivergenceAnalysis *DA = nullptr;
Module *Mod = nullptr;
const DataLayout *DL = nullptr;
@@ -152,15 +172,33 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
/// SelectionDAG has an issue where an and asserting the bits are known
bool replaceMulWithMul24(BinaryOperator &I) const;
+  /// Perform the same function as the equivalently named function in the
+  /// DAGCombiner. Since we expand some divisions here, we need to perform this
+  /// fold before the expansion obscures the pattern.
+ bool foldBinOpIntoSelect(BinaryOperator &I) const;
+
+ bool divHasSpecialOptimization(BinaryOperator &I,
+ Value *Num, Value *Den) const;
+ int getDivNumBits(BinaryOperator &I,
+ Value *Num, Value *Den,
+ unsigned AtLeast, bool Signed) const;
+
/// Expands 24 bit div or rem.
Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
Value *Num, Value *Den,
bool IsDiv, bool IsSigned) const;
+ Value *expandDivRem24Impl(IRBuilder<> &Builder, BinaryOperator &I,
+ Value *Num, Value *Den, unsigned NumBits,
+ bool IsDiv, bool IsSigned) const;
+
/// Expands 32 bit div or rem.
Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
Value *Num, Value *Den) const;
+ Value *shrinkDivRem64(IRBuilder<> &Builder, BinaryOperator &I,
+ Value *Num, Value *Den) const;
+ void expandDivRem64(BinaryOperator &I) const;
+
/// Widen a scalar load.
///
/// \details \p Widen scalar load for uniform, small type loads from constant
@@ -195,7 +233,10 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<LegacyDivergenceAnalysis>();
- AU.setPreservesAll();
+
+ // FIXME: Division expansion needs to preserve the dominator tree.
+ if (!ExpandDiv64InIR)
+ AU.setPreservesAll();
}
};
@@ -214,7 +255,7 @@ Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
if (T->isIntegerTy())
return B.getInt32Ty();
- return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
+ return FixedVectorType::get(B.getInt32Ty(), cast<FixedVectorType>(T));
}
bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
@@ -276,10 +317,9 @@ bool AMDGPUCodeGenPrepare::canWidenScalarExtLoad(LoadInst &I) const {
Type *Ty = I.getType();
const DataLayout &DL = Mod->getDataLayout();
int TySize = DL.getTypeSizeInBits(Ty);
- unsigned Align = I.getAlignment() ?
- I.getAlignment() : DL.getABITypeAlignment(Ty);
+ Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty);
- return I.isSimple() && TySize < 32 && Align >= 4 && DA->isUniform(&I);
+ return I.isSimple() && TySize < 32 && Alignment >= 4 && DA->isUniform(&I);
}
bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
@@ -436,7 +476,7 @@ bool AMDGPUCodeGenPrepare::isU24(Value *V, unsigned ScalarSize) const {
static void extractValues(IRBuilder<> &Builder,
SmallVectorImpl<Value *> &Values, Value *V) {
- VectorType *VT = dyn_cast<VectorType>(V->getType());
+ auto *VT = dyn_cast<FixedVectorType>(V->getType());
if (!VT) {
Values.push_back(V);
return;
@@ -525,58 +565,218 @@ bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const {
return true;
}
-static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) {
- const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
- if (!CNum)
- return HasDenormals;
+// Find a select instruction, which may have been cast. This is mostly to deal
+// with cases where i16 selects were promoted here to i32.
+static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast) {
+ Cast = nullptr;
+ if (SelectInst *Sel = dyn_cast<SelectInst>(V))
+ return Sel;
- if (UnsafeDiv)
- return true;
+ if ((Cast = dyn_cast<CastInst>(V))) {
+ if (SelectInst *Sel = dyn_cast<SelectInst>(Cast->getOperand(0)))
+ return Sel;
+ }
+
+ return nullptr;
+}
+
+bool AMDGPUCodeGenPrepare::foldBinOpIntoSelect(BinaryOperator &BO) const {
+ // Don't do this unless the old select is going away. We want to eliminate the
+ // binary operator, not replace a binop with a select.
+ int SelOpNo = 0;
+
+ CastInst *CastOp;
+
+ // TODO: Should probably try to handle some cases with multiple
+ // users. Duplicating the select may be profitable for division.
+ SelectInst *Sel = findSelectThroughCast(BO.getOperand(0), CastOp);
+ if (!Sel || !Sel->hasOneUse()) {
+ SelOpNo = 1;
+ Sel = findSelectThroughCast(BO.getOperand(1), CastOp);
+ }
+
+ if (!Sel || !Sel->hasOneUse())
+ return false;
+
+ Constant *CT = dyn_cast<Constant>(Sel->getTrueValue());
+ Constant *CF = dyn_cast<Constant>(Sel->getFalseValue());
+ Constant *CBO = dyn_cast<Constant>(BO.getOperand(SelOpNo ^ 1));
+ if (!CBO || !CT || !CF)
+ return false;
+
+ if (CastOp) {
+ if (!CastOp->hasOneUse())
+ return false;
+ CT = ConstantFoldCastOperand(CastOp->getOpcode(), CT, BO.getType(), *DL);
+ CF = ConstantFoldCastOperand(CastOp->getOpcode(), CF, BO.getType(), *DL);
+ }
+
+ // TODO: Handle special 0/-1 cases DAG combine does, although we only really
+ // need to handle divisions here.
+ Constant *FoldedT = SelOpNo ?
+ ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CT, *DL) :
+ ConstantFoldBinaryOpOperands(BO.getOpcode(), CT, CBO, *DL);
+ if (isa<ConstantExpr>(FoldedT))
+ return false;
+
+ Constant *FoldedF = SelOpNo ?
+ ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CF, *DL) :
+ ConstantFoldBinaryOpOperands(BO.getOpcode(), CF, CBO, *DL);
+ if (isa<ConstantExpr>(FoldedF))
+ return false;
+
+ IRBuilder<> Builder(&BO);
+ Builder.SetCurrentDebugLocation(BO.getDebugLoc());
+ if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&BO))
+ Builder.setFastMathFlags(FPOp->getFastMathFlags());
+
+ Value *NewSelect = Builder.CreateSelect(Sel->getCondition(),
+ FoldedT, FoldedF);
+ NewSelect->takeName(&BO);
+ BO.replaceAllUsesWith(NewSelect);
+ BO.eraseFromParent();
+ if (CastOp)
+ CastOp->eraseFromParent();
+ Sel->eraseFromParent();
+ return true;
+}
+
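A trivial standalone illustration (assumed constants) of the fold foldBinOpIntoSelect performs above: a division whose divisor is a single-use select of two constants becomes a select of the two pre-folded quotients, eliminating the division before the expansion code below ever sees it.

    #include <cstdio>

    int main() {
      bool Cond = true;
      // Before the fold:  64 / (Cond ? 16 : 8)
      int Before = 64 / (Cond ? 16 : 8);
      // After the fold:   Cond ? (64 / 16) : (64 / 8), i.e. Cond ? 4 : 8
      int After = Cond ? (64 / 16) : (64 / 8);
      std::printf("%d %d\n", Before, After); // 4 4
    }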
+// Optimize fdiv with rcp:
+//
+// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
+// allowed with unsafe-fp-math or afn.
+//
+// a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
+static Value *optimizeWithRcp(Value *Num, Value *Den, bool AllowInaccurateRcp,
+ bool RcpIsAccurate, IRBuilder<> &Builder,
+ Module *Mod) {
+
+ if (!AllowInaccurateRcp && !RcpIsAccurate)
+ return nullptr;
+
+ Type *Ty = Den->getType();
+ if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
+ if (AllowInaccurateRcp || RcpIsAccurate) {
+ if (CLHS->isExactlyValue(1.0)) {
+ Function *Decl = Intrinsic::getDeclaration(
+ Mod, Intrinsic::amdgcn_rcp, Ty);
+
+ // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
+ // the CI documentation has a worst case error of 1 ulp.
+ // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
+ // use it as long as we aren't trying to use denormals.
+ //
+ // v_rcp_f16 and v_rsq_f16 DO support denormals.
+
+ // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't
+ // insert rsq intrinsic here.
+
+ // 1.0 / x -> rcp(x)
+ return Builder.CreateCall(Decl, { Den });
+ }
+
+ // Same as for 1.0, but expand the sign out of the constant.
+ if (CLHS->isExactlyValue(-1.0)) {
+ Function *Decl = Intrinsic::getDeclaration(
+ Mod, Intrinsic::amdgcn_rcp, Ty);
+
+ // -1.0 / x -> rcp (fneg x)
+ Value *FNeg = Builder.CreateFNeg(Den);
+ return Builder.CreateCall(Decl, { FNeg });
+ }
+ }
+ }
- bool IsOne = CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0);
+ if (AllowInaccurateRcp) {
+ Function *Decl = Intrinsic::getDeclaration(
+ Mod, Intrinsic::amdgcn_rcp, Ty);
- // Reciprocal f32 is handled separately without denormals.
- return HasDenormals ^ IsOne;
+ // Turn into multiply by the reciprocal.
+ // x / y -> x * (1.0 / y)
+ Value *Recip = Builder.CreateCall(Decl, { Den });
+ return Builder.CreateFMul(Num, Recip);
+ }
+ return nullptr;
+}
+
+// optimize with fdiv.fast:
+//
+// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
+//
+// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
+//
+// NOTE: optimizeWithRcp should be tried first because rcp is the preference.
+static Value *optimizeWithFDivFast(Value *Num, Value *Den, float ReqdAccuracy,
+ bool HasDenormals, IRBuilder<> &Builder,
+ Module *Mod) {
+ // fdiv.fast can achieve 2.5 ULP accuracy.
+ if (ReqdAccuracy < 2.5f)
+ return nullptr;
+
+ // Only have fdiv.fast for f32.
+ Type *Ty = Den->getType();
+ if (!Ty->isFloatTy())
+ return nullptr;
+
+ bool NumIsOne = false;
+ if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) {
+ if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))
+ NumIsOne = true;
+ }
+
+  // fdiv.fast does not support denormals, but 1.0/x is always fine to use.
+ if (HasDenormals && !NumIsOne)
+ return nullptr;
+
+ Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
+ return Builder.CreateCall(Decl, { Num, Den });
}
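Condensed standalone sketch (an assumed simplification, not the in-tree control flow) of how visitFDiv below combines the two helpers above for an f32 divide: try rcp first, then fdiv.fast, otherwise keep the original fdiv.

    #include <cstdio>

    enum class FDivLowering { Rcp, FDivFast, Keep };

    static FDivLowering pickF32Lowering(bool AllowInaccurateRcp, bool RcpIsAccurate,
                                        float ReqdAccuracy, bool HasDenormals,
                                        bool NumIsOne) {
      if (AllowInaccurateRcp || (RcpIsAccurate && NumIsOne))
        return FDivLowering::Rcp;       // 1/x -> rcp(x), or a/b -> a * rcp(b)
      if (ReqdAccuracy >= 2.5f && (!HasDenormals || NumIsOne))
        return FDivLowering::FDivFast;  // llvm.amdgcn.fdiv.fast
      return FDivLowering::Keep;        // leave the plain fdiv in place
    }

    int main() {
      // !fpmath 1.0 ulp, denormals flushed, numerator is 1.0: rcp is accurate.
      std::printf("%d\n", (int)pickF32Lowering(false, true, 1.0f, false, true));
    }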
-// Insert an intrinsic for fast fdiv for safe math situations where we can
-// reduce precision. Leave fdiv for situations where the generic node is
-// expected to be optimized.
+// Optimization is performed based on fpmath, the fast-math flags, and the
+// denormal mode, to optimize fdiv with either rcp or fdiv.fast.
+//
+// With rcp:
+// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
+// allowed with unsafe-fp-math or afn.
+//
+// a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
+//
+// With fdiv.fast:
+// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
+//
+// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
+//
+// NOTE: rcp is the preference in cases that both are legal.
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
- Type *Ty = FDiv.getType();
- if (!Ty->getScalarType()->isFloatTy())
- return false;
+ Type *Ty = FDiv.getType()->getScalarType();
- MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
- if (!FPMath)
+ // No intrinsic for fdiv16 if target does not support f16.
+ if (Ty->isHalfTy() && !ST->has16BitInsts())
return false;
const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
- float ULP = FPOp->getFPAccuracy();
- if (ULP < 2.5f)
- return false;
+ const float ReqdAccuracy = FPOp->getFPAccuracy();
+ // Inaccurate rcp is allowed with unsafe-fp-math or afn.
FastMathFlags FMF = FPOp->getFastMathFlags();
- bool UnsafeDiv = HasUnsafeFPMath || FMF.isFast() ||
- FMF.allowReciprocal();
+ const bool AllowInaccurateRcp = HasUnsafeFPMath || FMF.approxFunc();
- // With UnsafeDiv node will be optimized to just rcp and mul.
- if (UnsafeDiv)
- return false;
+ // rcp_f16 is accurate for !fpmath >= 1.0ulp.
+ // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
+ // rcp_f64 is never accurate.
+ const bool RcpIsAccurate = (Ty->isHalfTy() && ReqdAccuracy >= 1.0f) ||
+ (Ty->isFloatTy() && !HasFP32Denormals && ReqdAccuracy >= 1.0f);
- IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
+ IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
Builder.setFastMathFlags(FMF);
Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
- Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
-
Value *Num = FDiv.getOperand(0);
Value *Den = FDiv.getOperand(1);
Value *NewFDiv = nullptr;
-
- if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
+ if (auto *VT = dyn_cast<FixedVectorType>(FDiv.getType())) {
NewFDiv = UndefValue::get(VT);
// FIXME: Doesn't do the right thing for cases where the vector is partially
@@ -584,19 +784,25 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
Value *NumEltI = Builder.CreateExtractElement(Num, I);
Value *DenEltI = Builder.CreateExtractElement(Den, I);
- Value *NewElt;
-
- if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasFP32Denormals)) {
+ // Try rcp first.
+ Value *NewElt = optimizeWithRcp(NumEltI, DenEltI, AllowInaccurateRcp,
+ RcpIsAccurate, Builder, Mod);
+ if (!NewElt) // Try fdiv.fast.
+ NewElt = optimizeWithFDivFast(NumEltI, DenEltI, ReqdAccuracy,
+ HasFP32Denormals, Builder, Mod);
+ if (!NewElt) // Keep the original.
NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
- } else {
- NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
- }
NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
}
- } else {
- if (!shouldKeepFDivF32(Num, UnsafeDiv, HasFP32Denormals))
- NewFDiv = Builder.CreateCall(Decl, { Num, Den });
+ } else { // Scalar FDiv.
+ // Try rcp first.
+ NewFDiv = optimizeWithRcp(Num, Den, AllowInaccurateRcp, RcpIsAccurate,
+ Builder, Mod);
+ if (!NewFDiv) { // Try fdiv.fast.
+ NewFDiv = optimizeWithFDivFast(Num, Den, ReqdAccuracy, HasFP32Denormals,
+ Builder, Mod);
+ }
}
if (NewFDiv) {
@@ -631,31 +837,49 @@ static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
return getMul64(Builder, LHS, RHS).second;
}
-// The fractional part of a float is enough to accurately represent up to
-// a 24-bit signed integer.
-Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
- BinaryOperator &I,
- Value *Num, Value *Den,
- bool IsDiv, bool IsSigned) const {
- assert(Num->getType()->isIntegerTy(32));
-
+/// Figure out how many bits are really needed for this division. \p AtLeast is
+/// an optimization hint to bypass the second ComputeNumSignBits call if the
+/// first one is insufficient. Returns -1 on failure.
+int AMDGPUCodeGenPrepare::getDivNumBits(BinaryOperator &I,
+ Value *Num, Value *Den,
+ unsigned AtLeast, bool IsSigned) const {
const DataLayout &DL = Mod->getDataLayout();
unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I);
- if (LHSSignBits < 9)
- return nullptr;
+ if (LHSSignBits < AtLeast)
+ return -1;
unsigned RHSSignBits = ComputeNumSignBits(Den, DL, 0, AC, &I);
- if (RHSSignBits < 9)
- return nullptr;
-
+ if (RHSSignBits < AtLeast)
+ return -1;
unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
- unsigned DivBits = 32 - SignBits;
+ unsigned DivBits = Num->getType()->getScalarSizeInBits() - SignBits;
if (IsSigned)
++DivBits;
+ return DivBits;
+}
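A worked example of the DivBits computation above, as standalone arithmetic (with assumed sign-bit counts): two 32-bit operands that are each known to fit in 8 bits have at least 24 redundant sign bits, so the divide only needs 8 bits, or 9 when signed.

    #include <algorithm>
    #include <cstdio>

    int main() {
      unsigned BitWidth = 32;
      unsigned LHSSignBits = 25, RHSSignBits = 24; // operands known to be small
      unsigned AtLeast = 9;                        // threshold expandDivRem24 passes in
      bool IsSigned = true;

      if (LHSSignBits < AtLeast || RHSSignBits < AtLeast)
        return 1;                                  // the in-tree code returns -1 here

      unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
      unsigned DivBits = BitWidth - SignBits;      // 8
      if (IsSigned)
        ++DivBits;                                 // 9
      std::printf("DivBits = %u\n", DivBits);
    }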
- Type *Ty = Num->getType();
+// The fractional part of a float is enough to accurately represent up to
+// a 24-bit signed integer.
+Value *AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
+ BinaryOperator &I,
+ Value *Num, Value *Den,
+ bool IsDiv, bool IsSigned) const {
+ int DivBits = getDivNumBits(I, Num, Den, 9, IsSigned);
+ if (DivBits == -1)
+ return nullptr;
+ return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned);
+}
+
+Value *AMDGPUCodeGenPrepare::expandDivRem24Impl(IRBuilder<> &Builder,
+ BinaryOperator &I,
+ Value *Num, Value *Den,
+ unsigned DivBits,
+ bool IsDiv, bool IsSigned) const {
Type *I32Ty = Builder.getInt32Ty();
+ Num = Builder.CreateTrunc(Num, I32Ty);
+ Den = Builder.CreateTrunc(Den, I32Ty);
+
Type *F32Ty = Builder.getFloatTy();
ConstantInt *One = Builder.getInt32(1);
Value *JQ = One;
@@ -685,7 +909,9 @@ Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
Value *FB = IsSigned ? Builder.CreateSIToFP(IB,F32Ty)
: Builder.CreateUIToFP(IB,F32Ty);
- Value *RCP = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), FB);
+ Function *RcpDecl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp,
+ Builder.getFloatTy());
+ Value *RCP = Builder.CreateCall(RcpDecl, { FB });
Value *FQM = Builder.CreateFMul(FA, RCP);
// fq = trunc(fqm);
@@ -696,7 +922,10 @@ Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
Value *FQNeg = Builder.CreateFNeg(FQ);
// float fr = mad(fqneg, fb, fa);
- Value *FR = Builder.CreateIntrinsic(Intrinsic::amdgcn_fmad_ftz,
+ auto FMAD = !ST->hasMadMacF32Insts()
+ ? Intrinsic::fma
+ : (Intrinsic::ID)Intrinsic::amdgcn_fmad_ftz;
+ Value *FR = Builder.CreateIntrinsic(FMAD,
{FQNeg->getType()}, {FQNeg, FB, FA}, FQ);
// int iq = (int)fq;
@@ -725,21 +954,72 @@ Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
Res = Builder.CreateSub(Num, Rem);
}
- // Truncate to number of bits this divide really is.
- if (IsSigned) {
- Res = Builder.CreateTrunc(Res, Builder.getIntNTy(DivBits));
- Res = Builder.CreateSExt(Res, Ty);
- } else {
- ConstantInt *TruncMask = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
- Res = Builder.CreateAnd(Res, TruncMask);
+ if (DivBits != 0 && DivBits < 32) {
+ // Extend in register from the number of bits this divide really is.
+ if (IsSigned) {
+ int InRegBits = 32 - DivBits;
+
+ Res = Builder.CreateShl(Res, InRegBits);
+ Res = Builder.CreateAShr(Res, InRegBits);
+ } else {
+ ConstantInt *TruncMask
+ = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
+ Res = Builder.CreateAnd(Res, TruncMask);
+ }
}
return Res;
}
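A host-side model of the unsigned flavour of the expansion above (including the quotient-correction step that follows the shown lines in the full function); this is illustrative C++ standing in for the generated IR, with assumed inputs, not the in-tree code.

    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    static uint32_t udiv24(uint32_t Num, uint32_t Den) {
      float FA = (float)Num;                 // fa = uitofp(num)
      float FB = (float)Den;                 // fb = uitofp(den)
      float RCP = 1.0f / FB;                 // stands in for v_rcp_f32(fb)
      float FQ = std::trunc(FA * RCP);       // fq = trunc(fa * rcp)
      float FR = std::fma(-FQ, FB, FA);      // fr = mad(-fq, fb, fa)
      uint32_t IQ = (uint32_t)FQ;            // iq = (int)fq
      if (std::fabs(FR) >= std::fabs(FB))    // correction: quotient was one low
        ++IQ;
      return IQ;
    }

    int main() {
      std::printf("%u\n", udiv24(1000000u, 7u)); // 142857
    }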
-Value* AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder,
- BinaryOperator &I,
- Value *Num, Value *Den) const {
+// Try to recognize special cases where the DAG will emit better expansions
+// than the general expansion we do here.
+
+// TODO: It would be better to just directly handle those optimizations here.
+bool AMDGPUCodeGenPrepare::divHasSpecialOptimization(
+ BinaryOperator &I, Value *Num, Value *Den) const {
+ if (Constant *C = dyn_cast<Constant>(Den)) {
+ // Arbitrary constants get a better expansion as long as a wider mulhi is
+ // legal.
+ if (C->getType()->getScalarSizeInBits() <= 32)
+ return true;
+
+ // TODO: Sdiv check for not exact for some reason.
+
+ // If there's no wider mulhi, there's only a better expansion for powers of
+ // two.
+ // TODO: This should really be checked per vector element.
+ if (isKnownToBeAPowerOfTwo(C, *DL, true, 0, AC, &I, DT))
+ return true;
+
+ return false;
+ }
+
+ if (BinaryOperator *BinOpDen = dyn_cast<BinaryOperator>(Den)) {
+ // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
+ if (BinOpDen->getOpcode() == Instruction::Shl &&
+ isa<Constant>(BinOpDen->getOperand(0)) &&
+ isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), *DL, true,
+ 0, AC, &I, DT)) {
+ return true;
+ }
+ }
+
+ return false;
+}
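The shl case in the predicate above rests on a simple identity for unsigned division by a shifted power of two; a quick standalone check with illustrative values (no shift overflow assumed):

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t x = 1234567u, c = 8u, y = 3u;                // c = 2^3, log2(c) = 3
  std::printf("%u %u\n", x / (c << y), x >> (3u + y));  // both print 19290
}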
+
+static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout *DL) {
+ // Check whether the sign can be determined statically.
+ KnownBits Known = computeKnownBits(V, *DL);
+ if (Known.isNegative())
+ return Constant::getAllOnesValue(V->getType());
+ if (Known.isNonNegative())
+ return Constant::getNullValue(V->getType());
+ return Builder.CreateAShr(V, Builder.getInt32(31));
+}
+
+Value *AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder,
+ BinaryOperator &I, Value *X,
+ Value *Y) const {
Instruction::BinaryOps Opc = I.getOpcode();
assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
Opc == Instruction::SRem || Opc == Instruction::SDiv);
@@ -748,142 +1028,171 @@ Value* AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder,
FMF.setFast();
Builder.setFastMathFlags(FMF);
- if (isa<Constant>(Den))
- return nullptr; // Keep it for optimization
+ if (divHasSpecialOptimization(I, X, Y))
+ return nullptr; // Keep it for later optimization.
bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;
- Type *Ty = Num->getType();
+ Type *Ty = X->getType();
Type *I32Ty = Builder.getInt32Ty();
Type *F32Ty = Builder.getFloatTy();
if (Ty->getScalarSizeInBits() < 32) {
if (IsSigned) {
- Num = Builder.CreateSExt(Num, I32Ty);
- Den = Builder.CreateSExt(Den, I32Ty);
+ X = Builder.CreateSExt(X, I32Ty);
+ Y = Builder.CreateSExt(Y, I32Ty);
} else {
- Num = Builder.CreateZExt(Num, I32Ty);
- Den = Builder.CreateZExt(Den, I32Ty);
+ X = Builder.CreateZExt(X, I32Ty);
+ Y = Builder.CreateZExt(Y, I32Ty);
}
}
- if (Value *Res = expandDivRem24(Builder, I, Num, Den, IsDiv, IsSigned)) {
- Res = Builder.CreateTrunc(Res, Ty);
- return Res;
+ if (Value *Res = expandDivRem24(Builder, I, X, Y, IsDiv, IsSigned)) {
+ return IsSigned ? Builder.CreateSExtOrTrunc(Res, Ty) :
+ Builder.CreateZExtOrTrunc(Res, Ty);
}
ConstantInt *Zero = Builder.getInt32(0);
ConstantInt *One = Builder.getInt32(1);
- ConstantInt *MinusOne = Builder.getInt32(~0);
Value *Sign = nullptr;
if (IsSigned) {
- ConstantInt *K31 = Builder.getInt32(31);
- Value *LHSign = Builder.CreateAShr(Num, K31);
- Value *RHSign = Builder.CreateAShr(Den, K31);
+ Value *SignX = getSign32(X, Builder, DL);
+ Value *SignY = getSign32(Y, Builder, DL);
// Remainder sign is the same as LHS
- Sign = IsDiv ? Builder.CreateXor(LHSign, RHSign) : LHSign;
+ Sign = IsDiv ? Builder.CreateXor(SignX, SignY) : SignX;
- Num = Builder.CreateAdd(Num, LHSign);
- Den = Builder.CreateAdd(Den, RHSign);
+ X = Builder.CreateAdd(X, SignX);
+ Y = Builder.CreateAdd(Y, SignY);
- Num = Builder.CreateXor(Num, LHSign);
- Den = Builder.CreateXor(Den, RHSign);
+ X = Builder.CreateXor(X, SignX);
+ Y = Builder.CreateXor(Y, SignY);
}
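The signed set-up above (and the matching sign restore further down) uses branch-free two's-complement identities; a small check of both, assuming an arithmetic right shift (guaranteed from C++20, universal in practice):

#include <cstdint>
#include <cstdio>

int main() {
  int32_t x = -42;
  int32_t s = x >> 31;                 // all-ones if negative, zero otherwise
  int32_t absx = (x + s) ^ s;          // 42: the add/xor pair above
  int32_t back = (absx ^ s) - s;       // -42: the xor/sub restore at the end
  std::printf("%d %d %d\n", s, absx, back);
}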
- // RCP = URECIP(Den) = 2^32 / Den + e
- // e is rounding error.
- Value *DEN_F32 = Builder.CreateUIToFP(Den, F32Ty);
- Value *RCP_F32 = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), DEN_F32);
- Constant *UINT_MAX_PLUS_1 = ConstantFP::get(F32Ty, BitsToFloat(0x4f800000));
- Value *RCP_SCALE = Builder.CreateFMul(RCP_F32, UINT_MAX_PLUS_1);
- Value *RCP = Builder.CreateFPToUI(RCP_SCALE, I32Ty);
-
- // RCP_LO, RCP_HI = mul(RCP, Den) */
- Value *RCP_LO, *RCP_HI;
- std::tie(RCP_LO, RCP_HI) = getMul64(Builder, RCP, Den);
-
- // NEG_RCP_LO = -RCP_LO
- Value *NEG_RCP_LO = Builder.CreateNeg(RCP_LO);
-
- // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
- Value *RCP_HI_0_CC = Builder.CreateICmpEQ(RCP_HI, Zero);
- Value *ABS_RCP_LO = Builder.CreateSelect(RCP_HI_0_CC, NEG_RCP_LO, RCP_LO);
-
- // Calculate the rounding error from the URECIP instruction
- // E = mulhu(ABS_RCP_LO, RCP)
- Value *E = getMulHu(Builder, ABS_RCP_LO, RCP);
-
- // RCP_A_E = RCP + E
- Value *RCP_A_E = Builder.CreateAdd(RCP, E);
-
- // RCP_S_E = RCP - E
- Value *RCP_S_E = Builder.CreateSub(RCP, E);
-
- // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E)
- Value *Tmp0 = Builder.CreateSelect(RCP_HI_0_CC, RCP_A_E, RCP_S_E);
-
- // Quotient = mulhu(Tmp0, Num)
- Value *Quotient = getMulHu(Builder, Tmp0, Num);
-
- // Num_S_Remainder = Quotient * Den
- Value *Num_S_Remainder = Builder.CreateMul(Quotient, Den);
+ // The algorithm here is based on ideas from "Software Integer Division", Tom
+ // Rodeheffer, August 2008.
+ //
+ // unsigned udiv(unsigned x, unsigned y) {
+ // // Initial estimate of inv(y). The constant is less than 2^32 to ensure
+ // // that this is a lower bound on inv(y), even if some of the calculations
+ // // round up.
+ // unsigned z = (unsigned)((4294967296.0 - 512.0) * v_rcp_f32((float)y));
+ //
+ // // One round of UNR (Unsigned integer Newton-Raphson) to improve z.
+ // // Empirically this is guaranteed to give a "two-y" lower bound on
+ // // inv(y).
+ // z += umulh(z, -y * z);
+ //
+ // // Quotient/remainder estimate.
+ // unsigned q = umulh(x, z);
+ // unsigned r = x - q * y;
+ //
+ // // Two rounds of quotient/remainder refinement.
+ // if (r >= y) {
+ // ++q;
+ // r -= y;
+ // }
+ // if (r >= y) {
+ // ++q;
+ // r -= y;
+ // }
+ //
+ // return q;
+ // }
+
+ // Initial estimate of inv(y).
+ Value *FloatY = Builder.CreateUIToFP(Y, F32Ty);
+ Function *Rcp = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, F32Ty);
+ Value *RcpY = Builder.CreateCall(Rcp, {FloatY});
+ Constant *Scale = ConstantFP::get(F32Ty, BitsToFloat(0x4F7FFFFE));
+ Value *ScaledY = Builder.CreateFMul(RcpY, Scale);
+ Value *Z = Builder.CreateFPToUI(ScaledY, I32Ty);
+
+ // One round of UNR.
+ Value *NegY = Builder.CreateSub(Zero, Y);
+ Value *NegYZ = Builder.CreateMul(NegY, Z);
+ Z = Builder.CreateAdd(Z, getMulHu(Builder, Z, NegYZ));
+
+ // Quotient/remainder estimate.
+ Value *Q = getMulHu(Builder, X, Z);
+ Value *R = Builder.CreateSub(X, Builder.CreateMul(Q, Y));
+
+ // First quotient/remainder refinement.
+ Value *Cond = Builder.CreateICmpUGE(R, Y);
+ if (IsDiv)
+ Q = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
+ R = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);
+
+ // Second quotient/remainder refinement.
+ Cond = Builder.CreateICmpUGE(R, Y);
+ Value *Res;
+ if (IsDiv)
+ Res = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
+ else
+ Res = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);
- // Remainder = Num - Num_S_Remainder
- Value *Remainder = Builder.CreateSub(Num, Num_S_Remainder);
+ if (IsSigned) {
+ Res = Builder.CreateXor(Res, Sign);
+ Res = Builder.CreateSub(Res, Sign);
+ }
- // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
- Value *Rem_GE_Den_CC = Builder.CreateICmpUGE(Remainder, Den);
- Value *Remainder_GE_Den = Builder.CreateSelect(Rem_GE_Den_CC, MinusOne, Zero);
+ Res = Builder.CreateTrunc(Res, Ty);
- // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
- Value *Num_GE_Num_S_Rem_CC = Builder.CreateICmpUGE(Num, Num_S_Remainder);
- Value *Remainder_GE_Zero = Builder.CreateSelect(Num_GE_Num_S_Rem_CC,
- MinusOne, Zero);
+ return Res;
+}
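A runnable model of the commented algorithm, with v_rcp_f32 approximated by an ordinary reciprocal; a sketch for intuition, not the emitted IR:

#include <cstdint>
#include <cstdio>

static uint32_t umulh(uint32_t a, uint32_t b) {
  return (uint32_t)(((uint64_t)a * b) >> 32);
}

static uint32_t udiv32(uint32_t x, uint32_t y) {
  // Initial estimate of inv(y); the scale is slightly below 2^32 so the
  // estimate stays a lower bound even with rounding.
  uint32_t z = (uint32_t)((4294967296.0 - 512.0) * (1.0 / (double)y));
  // One round of unsigned Newton-Raphson to improve z.
  uint32_t negYZ = (uint32_t)(0u - y) * z;
  z += umulh(z, negYZ);
  // Quotient/remainder estimate, then two refinement rounds.
  uint32_t q = umulh(x, z);
  uint32_t r = x - q * y;
  if (r >= y) { ++q; r -= y; }
  if (r >= y) { ++q; r -= y; }
  return q;
}

int main() {
  uint32_t x = 0xdeadbeefu, y = 12345u;
  std::printf("%u %u\n", udiv32(x, y), x / y);   // both print 302626
}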
- // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
- Value *Tmp1 = Builder.CreateAnd(Remainder_GE_Den, Remainder_GE_Zero);
- Value *Tmp1_0_CC = Builder.CreateICmpEQ(Tmp1, Zero);
+Value *AMDGPUCodeGenPrepare::shrinkDivRem64(IRBuilder<> &Builder,
+ BinaryOperator &I,
+ Value *Num, Value *Den) const {
+ if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))
+ return nullptr; // Keep it for later optimization.
- Value *Res;
- if (IsDiv) {
- // Quotient_A_One = Quotient + 1
- Value *Quotient_A_One = Builder.CreateAdd(Quotient, One);
+ Instruction::BinaryOps Opc = I.getOpcode();
- // Quotient_S_One = Quotient - 1
- Value *Quotient_S_One = Builder.CreateSub(Quotient, One);
+ bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
+ bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;
- // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
- Value *Div = Builder.CreateSelect(Tmp1_0_CC, Quotient, Quotient_A_One);
+ int NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned);
+ if (NumDivBits == -1)
+ return nullptr;
- // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
- Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Div, Quotient_S_One);
- } else {
- // Remainder_S_Den = Remainder - Den
- Value *Remainder_S_Den = Builder.CreateSub(Remainder, Den);
+ Value *Narrowed = nullptr;
+ if (NumDivBits <= 24) {
+ Narrowed = expandDivRem24Impl(Builder, I, Num, Den, NumDivBits,
+ IsDiv, IsSigned);
+ } else if (NumDivBits <= 32) {
+ Narrowed = expandDivRem32(Builder, I, Num, Den);
+ }
- // Remainder_A_Den = Remainder + Den
- Value *Remainder_A_Den = Builder.CreateAdd(Remainder, Den);
+ if (Narrowed) {
+ return IsSigned ? Builder.CreateSExt(Narrowed, Num->getType()) :
+ Builder.CreateZExt(Narrowed, Num->getType());
+ }
- // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
- Value *Rem = Builder.CreateSelect(Tmp1_0_CC, Remainder, Remainder_S_Den);
+ return nullptr;
+}
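The shrink above is just the observation that a 64-bit divide whose operands are known to fit in fewer bits can be done narrow and extended back; a trivial illustration:

#include <cstdint>
#include <cstdio>

int main() {
  int64_t a = -1000000, b = 37;                  // both fit easily in 32 bits
  int32_t narrow = (int32_t)a / (int32_t)b;      // 32-bit divide
  std::printf("%lld %lld\n", (long long)(int64_t)narrow,
              (long long)(a / b));               // both print -27027
}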
- // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
- Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Rem, Remainder_A_Den);
+void AMDGPUCodeGenPrepare::expandDivRem64(BinaryOperator &I) const {
+ Instruction::BinaryOps Opc = I.getOpcode();
+ // Do the general expansion.
+ if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {
+ expandDivisionUpTo64Bits(&I);
+ return;
}
- if (IsSigned) {
- Res = Builder.CreateXor(Res, Sign);
- Res = Builder.CreateSub(Res, Sign);
+ if (Opc == Instruction::URem || Opc == Instruction::SRem) {
+ expandRemainderUpTo64Bits(&I);
+ return;
}
- Res = Builder.CreateTrunc(Res, Ty);
-
- return Res;
+ llvm_unreachable("not a division");
}
bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
+ if (foldBinOpIntoSelect(I))
+ return true;
+
if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
DA->isUniform(&I) && promoteUniformOpToI32(I))
return true;
@@ -895,27 +1204,54 @@ bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
Instruction::BinaryOps Opc = I.getOpcode();
Type *Ty = I.getType();
Value *NewDiv = nullptr;
+ unsigned ScalarSize = Ty->getScalarSizeInBits();
+
+ SmallVector<BinaryOperator *, 8> Div64ToExpand;
+
if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
- Ty->getScalarSizeInBits() <= 32) {
+ ScalarSize <= 64 &&
+ !DisableIDivExpand) {
Value *Num = I.getOperand(0);
Value *Den = I.getOperand(1);
IRBuilder<> Builder(&I);
Builder.SetCurrentDebugLocation(I.getDebugLoc());
- if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
+ if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
NewDiv = UndefValue::get(VT);
for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
Value *NumEltN = Builder.CreateExtractElement(Num, N);
Value *DenEltN = Builder.CreateExtractElement(Den, N);
- Value *NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
- if (!NewElt)
- NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
+
+ Value *NewElt;
+ if (ScalarSize <= 32) {
+ NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
+ if (!NewElt)
+ NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
+ } else {
+ // See if this 64-bit division can be shrunk to 32/24-bits before
+ // producing the general expansion.
+ NewElt = shrinkDivRem64(Builder, I, NumEltN, DenEltN);
+ if (!NewElt) {
+ // The general 64-bit expansion introduces control flow and doesn't
+ // return the new value. Just insert a scalar copy and defer
+ // expanding it.
+ NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
+ Div64ToExpand.push_back(cast<BinaryOperator>(NewElt));
+ }
+ }
+
NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);
}
} else {
- NewDiv = expandDivRem32(Builder, I, Num, Den);
+ if (ScalarSize <= 32)
+ NewDiv = expandDivRem32(Builder, I, Num, Den);
+ else {
+ NewDiv = shrinkDivRem64(Builder, I, Num, Den);
+ if (!NewDiv)
+ Div64ToExpand.push_back(&I);
+ }
}
if (NewDiv) {
@@ -925,6 +1261,14 @@ bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
}
}
+ if (ExpandDiv64InIR) {
+ // TODO: We get much worse code in specially handled constant cases.
+ for (BinaryOperator *Div : Div64ToExpand) {
+ expandDivRem64(*Div);
+ Changed = true;
+ }
+ }
+
return Changed;
}
@@ -1033,16 +1377,36 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
ST = &TM.getSubtarget<GCNSubtarget>(F);
AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
DA = &getAnalysis<LegacyDivergenceAnalysis>();
+
+ auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ DT = DTWP ? &DTWP->getDomTree() : nullptr;
+
HasUnsafeFPMath = hasUnsafeFPMath(F);
- HasFP32Denormals = ST->hasFP32Denormals(F);
+
+ AMDGPU::SIModeRegisterDefaults Mode(F);
+ HasFP32Denormals = Mode.allFP32Denormals();
bool MadeChange = false;
- for (BasicBlock &BB : F) {
+ Function::iterator NextBB;
+ for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) {
+ BasicBlock *BB = &*FI;
+ NextBB = std::next(FI);
+
BasicBlock::iterator Next;
- for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; I = Next) {
Next = std::next(I);
+
MadeChange |= visit(*I);
+
+ if (Next != E) { // Control flow changed
+ BasicBlock *NextInstBB = Next->getParent();
+ if (NextInstBB != BB) {
+ BB = NextInstBB;
+ E = BB->end();
+ FE = F.end();
+ }
+ }
}
}
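The iterator juggling above exists because visiting an instruction can now split the current block (the deferred 64-bit expansion introduces control flow), so both the block and its end iterator are re-read after each visit. The capture-Next-first half of the idiom, shown on a plain std::list (illustrative only):

#include <cstdio>
#include <iterator>
#include <list>

int main() {
  std::list<int> Work = {1, 2, 3, 4};
  for (auto It = Work.begin(), End = Work.end(); It != End;) {
    auto Next = std::next(It);       // grab the successor first
    if (*It % 2 == 0)
      Work.erase(It);                // the "visit" may delete the current node
    else
      std::printf("%d ", *It);
    It = Next;                       // still valid even after the erase
  }
  std::printf("\n");                 // prints: 1 3
}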
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
new file mode 100644
index 0000000000000..faaf9168d0dd8
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -0,0 +1,69 @@
+//=- AMDGPUCombine.td - Define AMDGPU Combine Rules ----------*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/GlobalISel/Combine.td"
+
+// TODO: This really belongs after legalization, once scalarization has run.
+// TODO: GICombineRules should accept subtarget predicates
+
+def fmin_fmax_legacy_matchdata : GIDefMatchData<"FMinFMaxLegacyInfo">;
+
+def fcmp_select_to_fmin_fmax_legacy : GICombineRule<
+ (defs root:$select, fmin_fmax_legacy_matchdata:$matchinfo),
+ (match (wip_match_opcode G_SELECT):$select,
+ [{ return matchFMinFMaxLegacy(*${select}, MRI, *MF, ${matchinfo}); }]),
+ (apply [{ applySelectFCmpToFMinToFMaxLegacy(*${select}, ${matchinfo}); }])>;
+
+
+def uchar_to_float : GICombineRule<
+ (defs root:$itofp),
+ (match (wip_match_opcode G_UITOFP, G_SITOFP):$itofp,
+ [{ return matchUCharToFloat(*${itofp}, MRI, *MF, Helper); }]),
+ (apply [{ applyUCharToFloat(*${itofp}); }])>;
+
+def cvt_f32_ubyteN_matchdata : GIDefMatchData<"CvtF32UByteMatchInfo">;
+
+def cvt_f32_ubyteN : GICombineRule<
+ (defs root:$cvt_f32_ubyteN, cvt_f32_ubyteN_matchdata:$matchinfo),
+ (match (wip_match_opcode G_AMDGPU_CVT_F32_UBYTE0,
+ G_AMDGPU_CVT_F32_UBYTE1,
+ G_AMDGPU_CVT_F32_UBYTE2,
+ G_AMDGPU_CVT_F32_UBYTE3):$cvt_f32_ubyteN,
+ [{ return matchCvtF32UByteN(*${cvt_f32_ubyteN}, MRI, *MF, ${matchinfo}); }]),
+ (apply [{ applyCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }])>;
+
+// Combines which should only apply on SI/VI
+def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>;
+
+
+def AMDGPUPreLegalizerCombinerHelper: GICombinerHelper<
+ "AMDGPUGenPreLegalizerCombinerHelper", [all_combines,
+ elide_br_by_inverting_cond]> {
+ let DisableRuleOption = "amdgpuprelegalizercombiner-disable-rule";
+}
+
+
+// FIXME: combines_for_extload can introduce illegal extloads which
+// aren't re-legalized.
+// FIXME: Is there a way to remove a single item from all_combines?
+def all_combines_minus_extload : GICombineGroup<[trivial_combines,
+ ptr_add_immed_chain, combine_indexed_load_store, undef_combines,
+ identity_combines]
+>;
+
+def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper<
+ "AMDGPUGenPostLegalizerCombinerHelper",
+ [all_combines_minus_extload, gfx6gfx7_combines,
+ uchar_to_float, cvt_f32_ubyteN]> {
+ let DisableRuleOption = "amdgpupostlegalizercombiner-disable-rule";
+}
+
+def AMDGPURegBankCombinerHelper : GICombinerHelper<
+ "AMDGPUGenRegBankCombinerHelper", []> {
+ let DisableRuleOption = "amdgpuregbankcombiner-disable-rule";
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp
new file mode 100644
index 0000000000000..25c82ed61fc2e
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp
@@ -0,0 +1,150 @@
+//===--- AMDGPUExportClustering.cpp - AMDGPU Export Clustering -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains a DAG scheduling mutation to cluster shader
+/// exports.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUExportClustering.h"
+#include "AMDGPUSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+
+using namespace llvm;
+
+namespace {
+
+class ExportClustering : public ScheduleDAGMutation {
+public:
+ ExportClustering() {}
+ void apply(ScheduleDAGInstrs *DAG) override;
+};
+
+static bool isExport(const SUnit &SU) {
+ const MachineInstr *MI = SU.getInstr();
+ return MI->getOpcode() == AMDGPU::EXP ||
+ MI->getOpcode() == AMDGPU::EXP_DONE;
+}
+
+static bool isPositionExport(const SIInstrInfo *TII, SUnit *SU) {
+ const MachineInstr *MI = SU->getInstr();
+ int Imm = TII->getNamedOperand(*MI, AMDGPU::OpName::tgt)->getImm();
+ return Imm >= 12 && Imm <= 15;
+}
+
+static void sortChain(const SIInstrInfo *TII, SmallVector<SUnit *, 8> &Chain,
+ unsigned PosCount) {
+ if (!PosCount || PosCount == Chain.size())
+ return;
+
+ // Position exports should occur as soon as possible in the shader
+ // for optimal performance. This moves position exports before
+ // other exports while preserving the relative order within each export
+ // type (pos or other).
+ SmallVector<SUnit *, 8> Copy(Chain);
+ unsigned PosIdx = 0;
+ unsigned OtherIdx = PosCount;
+ for (SUnit *SU : Copy) {
+ if (isPositionExport(TII, SU))
+ Chain[PosIdx++] = SU;
+ else
+ Chain[OtherIdx++] = SU;
+ }
+}
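The two-cursor copy in sortChain is a stable partition: position exports move to the front while relative order inside each group is preserved. An equivalent sketch with std::stable_partition (illustrative, not the pass code):

#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

int main() {
  // first = 1 marks a position export, 0 any other export; second keeps the
  // original issue order so we can see it is preserved within each group.
  std::vector<std::pair<int, int>> Chain = {{0, 1}, {1, 2}, {0, 3}, {1, 4}};
  std::stable_partition(Chain.begin(), Chain.end(),
                        [](const auto &E) { return E.first == 1; });
  for (auto &E : Chain)
    std::printf("%d ", E.second);    // prints: 2 4 1 3
  std::printf("\n");
}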
+
+static void buildCluster(ArrayRef<SUnit *> Exports, ScheduleDAGInstrs *DAG) {
+ SUnit *ChainHead = Exports.front();
+
+ // Now construct cluster from chain by adding new edges.
+ for (unsigned Idx = 0, End = Exports.size() - 1; Idx < End; ++Idx) {
+ SUnit *SUa = Exports[Idx];
+ SUnit *SUb = Exports[Idx + 1];
+
+ // Copy all dependencies to the head of the chain to avoid any
+ // computation being inserted into the chain.
+ for (const SDep &Pred : SUb->Preds) {
+ SUnit *PredSU = Pred.getSUnit();
+ if (!isExport(*PredSU) && !Pred.isWeak())
+ DAG->addEdge(ChainHead, SDep(PredSU, SDep::Artificial));
+ }
+
+ // New barrier edge ordering exports
+ DAG->addEdge(SUb, SDep(SUa, SDep::Barrier));
+ // Also add cluster edge
+ DAG->addEdge(SUb, SDep(SUa, SDep::Cluster));
+ }
+}
+
+static void removeExportDependencies(ScheduleDAGInstrs *DAG, SUnit &SU) {
+ SmallVector<SDep, 2> ToAdd, ToRemove;
+
+ for (const SDep &Pred : SU.Preds) {
+ SUnit *PredSU = Pred.getSUnit();
+ if (Pred.isBarrier() && isExport(*PredSU)) {
+ ToRemove.push_back(Pred);
+ if (isExport(SU))
+ continue;
+
+ // If we remove a barrier we need to copy dependencies
+ // from the predecessor to maintain order.
+ for (const SDep &ExportPred : PredSU->Preds) {
+ SUnit *ExportPredSU = ExportPred.getSUnit();
+ if (ExportPred.isBarrier() && !isExport(*ExportPredSU))
+ ToAdd.push_back(SDep(ExportPredSU, SDep::Barrier));
+ }
+ }
+ }
+
+ for (SDep Pred : ToRemove)
+ SU.removePred(Pred);
+ for (SDep Pred : ToAdd)
+ DAG->addEdge(&SU, Pred);
+}
+
+void ExportClustering::apply(ScheduleDAGInstrs *DAG) {
+ const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(DAG->TII);
+
+ SmallVector<SUnit *, 8> Chain;
+
+ // Pass through the DAG gathering a list of exports and removing barrier
+ // edges that create dependencies on exports. Freeing exports from successor
+ // edges allows more scheduling freedom, and nothing should be order
+ // dependent on exports. Edges will be added later to order the exports.
+ unsigned PosCount = 0;
+ for (SUnit &SU : DAG->SUnits) {
+ if (!isExport(SU))
+ continue;
+
+ Chain.push_back(&SU);
+ if (isPositionExport(TII, &SU))
+ PosCount++;
+
+ removeExportDependencies(DAG, SU);
+
+ SmallVector<SDep, 4> Succs(SU.Succs);
+ for (SDep Succ : Succs)
+ removeExportDependencies(DAG, *Succ.getSUnit());
+ }
+
+ // Apply clustering if there are multiple exports
+ if (Chain.size() > 1) {
+ sortChain(TII, Chain, PosCount);
+ buildCluster(Chain, DAG);
+ }
+}
+
+} // end namespace
+
+namespace llvm {
+
+std::unique_ptr<ScheduleDAGMutation> createAMDGPUExportClusteringDAGMutation() {
+ return std::make_unique<ExportClustering>();
+}
+
+} // end namespace llvm
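A hedged sketch of how a target might hook the mutation into its MachineScheduler: only addMutation and the factory above come from this patch; the surrounding function and its name are placeholder boilerplate, not the actual AMDGPU scheduler setup.

#include "AMDGPUExportClustering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include <memory>

using namespace llvm;

static ScheduleDAGInstrs *
createSchedulerWithExportClustering(MachineSchedContext *C) {
  // Standard GenericScheduler driving a live-interval-aware DAG, plus the
  // export-clustering mutation defined above.
  auto *DAG = new ScheduleDAGMILive(C, std::make_unique<GenericScheduler>(C));
  DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
  return DAG;
}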
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.h b/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.h
new file mode 100644
index 0000000000000..58491d0671e4c
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.h
@@ -0,0 +1,15 @@
+//===- AMDGPUExportClustering.h - AMDGPU Export Clustering ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineScheduler.h"
+
+namespace llvm {
+
+std::unique_ptr<ScheduleDAGMutation> createAMDGPUExportClusteringDAGMutation();
+
+} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
index ea3952c316e4d..db00f8f711a33 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
@@ -18,15 +18,6 @@ def FeatureFMA : SubtargetFeature<"fmaf",
"Enable single precision FMA (not as fast as mul+add, but fused)"
>;
-// Some instructions do not support denormals despite this flag. Using
-// fp32 denormals also causes instructions to run at the double
-// precision rate for the device.
-def FeatureFP32Denormals : SubtargetFeature<"fp32-denormals",
- "FP32Denormals",
- "true",
- "Enable single precision denormal handling"
->;
-
class SubtargetFeatureLocalMemorySize <int Value> : SubtargetFeature<
"localmemorysize"#Value,
"LocalMemorySize",
@@ -38,16 +29,16 @@ def FeatureLocalMemorySize0 : SubtargetFeatureLocalMemorySize<0>;
def FeatureLocalMemorySize32768 : SubtargetFeatureLocalMemorySize<32768>;
def FeatureLocalMemorySize65536 : SubtargetFeatureLocalMemorySize<65536>;
-class SubtargetFeatureWavefrontSize <int Value> : SubtargetFeature<
- "wavefrontsize"#Value,
- "WavefrontSize",
- !cast<string>(Value),
+class SubtargetFeatureWavefrontSize <int ValueLog2> : SubtargetFeature<
+ "wavefrontsize"#!shl(1, ValueLog2),
+ "WavefrontSizeLog2",
+ !cast<string>(ValueLog2),
"The number of threads per wavefront"
>;
-def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>;
-def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>;
-def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>;
+def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<4>;
+def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<5>;
+def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<6>;
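The feature now stores log2 of the wavefront size, while the feature name is still built from the full size via !shl(1, ValueLog2); consumers reconstruct the size with a shift. A one-liner to see the encoding:

#include <cstdio>

int main() {
  const unsigned Log2s[] = {4, 5, 6};   // wavefrontsize16/32/64 above
  for (unsigned Log2 : Log2s)
    std::printf("log2=%u -> size=%u\n", Log2, 1u << Log2);
}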
class SubtargetFeatureGeneration <string Value, string FeatureName,
string Subtarget,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp b/llvm/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp
index 9ba04d113c702..ea6c6d0fd212b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp
@@ -15,6 +15,7 @@
#include "AMDGPU.h"
#include "llvm/IR/InstVisitor.h"
+#include "llvm/Pass.h"
#include "llvm/Transforms/Utils/CallPromotionUtils.h"
using namespace llvm;
@@ -31,12 +32,13 @@ class AMDGPUFixFunctionBitcasts final
bool Modified;
public:
- void visitCallSite(CallSite CS) {
- if (CS.getCalledFunction())
+ void visitCallBase(CallBase &CB) {
+ if (CB.getCalledFunction())
return;
- auto Callee = dyn_cast<Function>(CS.getCalledValue()->stripPointerCasts());
- if (Callee && isLegalToPromote(CS, Callee)) {
- promoteCall(CS, Callee);
+ auto *Callee =
+ dyn_cast<Function>(CB.getCalledOperand()->stripPointerCasts());
+ if (Callee && isLegalToPromote(CB, Callee)) {
+ promoteCall(CB, Callee);
Modified = true;
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h
index 92e256cf2829f..260a18e278cf2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h
@@ -26,7 +26,7 @@ namespace llvm {
class AMDGPUFrameLowering : public TargetFrameLowering {
public:
AMDGPUFrameLowering(StackDirection D, Align StackAl, int LAO,
- Align TransAl = Align::None());
+ Align TransAl = Align(1));
~AMDGPUFrameLowering() override;
/// \returns The number of 32-bit sub-registers that are used when storing
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index d420aa02ac28b..3f12addbcc79b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
include "AMDGPU.td"
+include "AMDGPUCombine.td"
def sd_vsrc0 : ComplexPattern<i32, 1, "">;
def gi_vsrc0 :
@@ -30,6 +31,10 @@ def gi_vop3mods :
GIComplexOperandMatcher<s32, "selectVOP3Mods">,
GIComplexPatternEquiv<VOP3Mods>;
+def gi_vop3_no_mods :
+ GIComplexOperandMatcher<s32, "selectVOP3NoMods">,
+ GIComplexPatternEquiv<VOP3NoMods>;
+
def gi_vop3mods_nnan :
GIComplexOperandMatcher<s32, "selectVOP3Mods_nnan">,
GIComplexPatternEquiv<VOP3Mods_nnan>;
@@ -38,9 +43,9 @@ def gi_vop3omods :
GIComplexOperandMatcher<s32, "selectVOP3OMods">,
GIComplexPatternEquiv<VOP3OMods>;
-def gi_vop3opselmods0 :
- GIComplexOperandMatcher<s32, "selectVOP3OpSelMods0">,
- GIComplexPatternEquiv<VOP3OpSelMods0>;
+def gi_vop3pmods :
+ GIComplexOperandMatcher<s32, "selectVOP3PMods">,
+ GIComplexPatternEquiv<VOP3PMods>;
def gi_vop3opselmods :
GIComplexOperandMatcher<s32, "selectVOP3OpSelMods">,
@@ -83,6 +88,33 @@ def gi_ds_1addr_1offset :
GIComplexOperandMatcher<s32, "selectDS1Addr1Offset">,
GIComplexPatternEquiv<DS1Addr1Offset>;
+def gi_ds_64bit_4byte_aligned :
+ GIComplexOperandMatcher<s64, "selectDS64Bit4ByteAligned">,
+ GIComplexPatternEquiv<DS64Bit4ByteAligned>;
+
+def gi_mubuf_addr64 :
+ GIComplexOperandMatcher<s64, "selectMUBUFAddr64">,
+ GIComplexPatternEquiv<MUBUFAddr64>;
+
+def gi_mubuf_offset :
+ GIComplexOperandMatcher<s64, "selectMUBUFOffset">,
+ GIComplexPatternEquiv<MUBUFOffset>;
+
+def gi_mubuf_addr64_atomic :
+ GIComplexOperandMatcher<s64, "selectMUBUFAddr64Atomic">,
+ GIComplexPatternEquiv<MUBUFAddr64Atomic>;
+
+def gi_mubuf_offset_atomic :
+ GIComplexOperandMatcher<s64, "selectMUBUFOffsetAtomic">,
+ GIComplexPatternEquiv<MUBUFOffsetAtomic>;
+
+def gi_smrd_buffer_imm :
+ GIComplexOperandMatcher<s64, "selectSMRDBufferImm">,
+ GIComplexPatternEquiv<SMRDBufferImm>;
+
+def gi_smrd_buffer_imm32 :
+ GIComplexOperandMatcher<s64, "selectSMRDBufferImm32">,
+ GIComplexPatternEquiv<SMRDBufferImm32>;
// Separate load nodes are defined to glue m0 initialization in
// SelectionDAG. The GISel selector can just insert m0 initialization
@@ -116,9 +148,54 @@ def : GINodeEquiv<G_ATOMICRMW_UMIN, atomic_load_umin_glue>;
def : GINodeEquiv<G_ATOMICRMW_UMAX, atomic_load_umax_glue>;
def : GINodeEquiv<G_ATOMICRMW_FADD, atomic_load_fadd_glue>;
-def : GINodeEquiv<G_AMDGPU_FFBH_U32, AMDGPUffbh_u32>;
-def : GINodeEquiv<G_AMDGPU_ATOMIC_CMPXCHG, AMDGPUatomic_cmp_swap>;
+def : GINodeEquiv<G_AMDGPU_FFBH_U32, AMDGPUffbh_u32_impl>;
+def : GINodeEquiv<G_AMDGPU_FMIN_LEGACY, AMDGPUfmin_legacy>;
+def : GINodeEquiv<G_AMDGPU_FMAX_LEGACY, AMDGPUfmax_legacy>;
+def : GINodeEquiv<G_AMDGPU_RCP_IFLAG, AMDGPUrcp_iflag>;
+def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE0, AMDGPUcvt_f32_ubyte0>;
+def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE1, AMDGPUcvt_f32_ubyte1>;
+def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE2, AMDGPUcvt_f32_ubyte2>;
+def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE3, AMDGPUcvt_f32_ubyte3>;
+
+def : GINodeEquiv<G_AMDGPU_ATOMIC_CMPXCHG, AMDGPUatomic_cmp_swap>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD, SIbuffer_load>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_USHORT, SIbuffer_load_ushort>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_UBYTE, SIbuffer_load_ubyte>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_SSHORT, SIbuffer_load_short>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_SBYTE, SIbuffer_load_byte>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_FORMAT, SIbuffer_load_format>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_FORMAT_D16, SIbuffer_load_format_d16>;
+def : GINodeEquiv<G_AMDGPU_TBUFFER_LOAD_FORMAT, SItbuffer_load>;
+def : GINodeEquiv<G_AMDGPU_TBUFFER_LOAD_FORMAT_D16, SItbuffer_load_d16>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_STORE, SIbuffer_store>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_STORE_SHORT, SIbuffer_store_short>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_STORE_BYTE, SIbuffer_store_byte>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_STORE_FORMAT, SIbuffer_store_format>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_STORE_FORMAT_D16, SIbuffer_store_format_d16>;
+def : GINodeEquiv<G_AMDGPU_TBUFFER_STORE_FORMAT, SItbuffer_store>;
+def : GINodeEquiv<G_AMDGPU_TBUFFER_STORE_FORMAT_D16, SItbuffer_store_d16>;
+
+// FIXME: Check MMO is atomic
+def : GINodeEquiv<G_AMDGPU_ATOMIC_INC, SIatomic_inc>;
+def : GINodeEquiv<G_AMDGPU_ATOMIC_DEC, SIatomic_dec>;
+def : GINodeEquiv<G_AMDGPU_ATOMIC_INC, atomic_inc_glue>;
+def : GINodeEquiv<G_AMDGPU_ATOMIC_DEC, atomic_dec_glue>;
+
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_SWAP, SIbuffer_atomic_swap>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_ADD, SIbuffer_atomic_add>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_SUB, SIbuffer_atomic_sub>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_SMIN, SIbuffer_atomic_smin>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_UMIN, SIbuffer_atomic_umin>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_SMAX, SIbuffer_atomic_smax>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_UMAX, SIbuffer_atomic_umax>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_AND, SIbuffer_atomic_and>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_OR, SIbuffer_atomic_or>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_XOR, SIbuffer_atomic_xor>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_INC, SIbuffer_atomic_inc>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_DEC, SIbuffer_atomic_dec>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_CMPSWAP, SIbuffer_atomic_cmpswap>;
+def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD, SIsbuffer_load>;
class GISelSop2Pat <
SDPatternOperator node,
@@ -188,16 +265,13 @@ multiclass GISelVop2IntrPat <
def : GISelVop2Pat <node, inst, dst_vt, src_vt>;
- // FIXME: Intrinsics aren't marked as commutable, so we need to add an explcit
+ // FIXME: Intrinsics aren't marked as commutable, so we need to add an explicit
// pattern to handle commuting. This is another reason why legalizing to a
// generic machine instruction may be better than matching the intrinsic
// directly.
def : GISelVop2CommutePat <node, inst, dst_vt, src_vt>;
}
-def : GISelSop2Pat <or, S_OR_B32, i32>;
-def : GISelVop2Pat <or, V_OR_B32_e32, i32>;
-
// Since GlobalISel is more flexible than SelectionDAG, I think we can get
// away with adding patterns for integer types and not legalizing all
// loads and stores to vector types. This should help simplify the load/store
@@ -206,12 +280,18 @@ foreach Ty = [i64, p0, p1, p4] in {
defm : SMRD_Pattern <"S_LOAD_DWORDX2", Ty>;
}
-def gi_as_i32timm : GICustomOperandRenderer<"renderTruncImm32">,
+def gi_as_i32timm : GICustomOperandRenderer<"renderTruncTImm32">,
GISDNodeXFormEquiv<as_i32timm>;
-def gi_as_i16timm : GICustomOperandRenderer<"renderTruncTImm">,
+def gi_as_i16timm : GICustomOperandRenderer<"renderTruncTImm16">,
GISDNodeXFormEquiv<as_i16timm>;
+def gi_as_i8timm : GICustomOperandRenderer<"renderTruncTImm8">,
+ GISDNodeXFormEquiv<as_i8timm>;
+
+def gi_as_i1timm : GICustomOperandRenderer<"renderTruncTImm1">,
+ GISDNodeXFormEquiv<as_i1timm>;
+
def gi_NegateImm : GICustomOperandRenderer<"renderNegateImm">,
GISDNodeXFormEquiv<NegateImm>;
@@ -220,3 +300,15 @@ def gi_bitcast_fpimm_to_i32 : GICustomOperandRenderer<"renderBitcastImm">,
def gi_IMMPopCount : GICustomOperandRenderer<"renderPopcntImm">,
GISDNodeXFormEquiv<IMMPopCount>;
+
+def gi_extract_glc : GICustomOperandRenderer<"renderExtractGLC">,
+ GISDNodeXFormEquiv<extract_glc>;
+
+def gi_extract_slc : GICustomOperandRenderer<"renderExtractSLC">,
+ GISDNodeXFormEquiv<extract_slc>;
+
+def gi_extract_dlc : GICustomOperandRenderer<"renderExtractDLC">,
+ GISDNodeXFormEquiv<extract_dlc>;
+
+def gi_extract_swz : GICustomOperandRenderer<"renderExtractSWZ">,
+ GISDNodeXFormEquiv<extract_swz>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def b/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
index 2e92ae51660b7..600b351f9ea1c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
@@ -132,7 +132,8 @@ const RegisterBankInfo::PartialMapping SGPROnly64BreakDown[] {
};
-// For some instructions which can operate 64-bit only for the scalar version.
+// For instructions that can only operate on 64 bits in the scalar version;
+// otherwise they need to be split into two 32-bit operations.
const RegisterBankInfo::ValueMapping ValMappingsSGPR64OnlyVGPR32[] {
/*32-bit sgpr*/ {&SGPROnly64BreakDown[0], 1},
/*2 x 32-bit sgpr*/ {&SGPROnly64BreakDown[1], 2},
@@ -207,75 +208,16 @@ const RegisterBankInfo::ValueMapping *getValueMappingSGPR64Only(unsigned BankID,
return &ValMappingsSGPR64OnlyVGPR32[2];
}
-const RegisterBankInfo::PartialMapping LoadSGPROnlyBreakDown[] {
- /* 256-bit load */ {0, 256, SGPRRegBank},
- /* 512-bit load */ {0, 512, SGPRRegBank},
- /* 8 32-bit loads */ {0, 32, VGPRRegBank}, {32, 32, VGPRRegBank},
- {64, 32, VGPRRegBank}, {96, 32, VGPRRegBank},
- {128, 32, VGPRRegBank}, {160, 32, VGPRRegBank},
- {192, 32, VGPRRegBank}, {224, 32, VGPRRegBank},
- /* 16 32-bit loads */ {0, 32, VGPRRegBank}, {32, 32, VGPRRegBank},
- {64, 32, VGPRRegBank}, {96, 32, VGPRRegBank},
- {128, 32, VGPRRegBank}, {160, 32, VGPRRegBank},
- {192, 32, VGPRRegBank}, {224, 32, VGPRRegBank},
- {256, 32, VGPRRegBank}, {288, 32, VGPRRegBank},
- {320, 32, VGPRRegBank}, {352, 32, VGPRRegBank},
- {384, 32, VGPRRegBank}, {416, 32, VGPRRegBank},
- {448, 32, VGPRRegBank}, {480, 32, VGPRRegBank},
- /* 4 64-bit loads */ {0, 64, VGPRRegBank}, {64, 64, VGPRRegBank},
- {128, 64, VGPRRegBank}, {192, 64, VGPRRegBank},
- /* 8 64-bit loads */ {0, 64, VGPRRegBank}, {64, 64, VGPRRegBank},
- {128, 64, VGPRRegBank}, {192, 64, VGPRRegBank},
- {256, 64, VGPRRegBank}, {320, 64, VGPRRegBank},
- {384, 64, VGPRRegBank}, {448, 64, VGPRRegBank},
-
- /* FIXME: The generic register bank select does not support complex
- * break downs where the number of vector elements does not equal the
- * number of breakdowns.
- * FIXME: register bank select now tries to handle complex break downs,
- * but it emits an illegal instruction:
- * %1:vgpr(<8 x s32>) = G_CONCAT_VECTORS %2:vgpr(s128), %3:vgpr(s128)
- */
- /* 2 128-bit loads */ {0, 128, VGPRRegBank}, {128, 128, VGPRRegBank},
- /* 4 128-bit loads */ {0, 128, VGPRRegBank}, {128, 128, VGPRRegBank},
- {256, 128, VGPRRegBank}, {384, 128, VGPRRegBank}
-};
-
-const RegisterBankInfo::ValueMapping ValMappingsLoadSGPROnly[] {
- /* 256-bit load */ {&LoadSGPROnlyBreakDown[0], 1},
- /* 512-bit load */ {&LoadSGPROnlyBreakDown[1], 1},
- /* <8 x i32> load */ {&LoadSGPROnlyBreakDown[2], 8},
- /* <16 x i32> load */ {&LoadSGPROnlyBreakDown[10], 16},
- /* <4 x i64> load */ {&LoadSGPROnlyBreakDown[26], 4},
- /* <8 x i64> load */ {&LoadSGPROnlyBreakDown[30], 8}
-};
-
-const RegisterBankInfo::ValueMapping *
-getValueMappingLoadSGPROnly(unsigned BankID, LLT SizeTy) {
- unsigned Size = SizeTy.getSizeInBits();
- if (Size < 256 || BankID == AMDGPU::SGPRRegBankID)
- return getValueMapping(BankID, Size);
-
- assert((Size == 256 || Size == 512) && BankID == AMDGPU::VGPRRegBankID);
-
- // Default to using the non-split ValueMappings, we will use these if
- // the register bank is SGPR or if we don't know how to handle the vector
- // type.
- unsigned Idx = Size == 256 ? 0 : 1;
-
- // We need to split this load if it has a vgpr pointer.
- if (BankID == AMDGPU::VGPRRegBankID) {
- if (SizeTy == LLT::vector(8, 32))
- Idx = 2;
- else if (SizeTy == LLT::vector(16, 32))
- Idx = 3;
- else if (SizeTy == LLT::vector(4, 64))
- Idx = 4;
- else if (SizeTy == LLT::vector(8, 64))
- Idx = 5;
- }
+/// Split any 64-bit value into 2 32-bit pieces. Unlike
+/// getValueMappingSGPR64Only, this splits both VGPRs and SGPRs.
+const RegisterBankInfo::ValueMapping *getValueMappingSplit64(unsigned BankID,
+ unsigned Size) {
+ assert(Size == 64);
+ if (BankID == AMDGPU::VGPRRegBankID)
+ return &ValMappingsSGPR64OnlyVGPR32[4];
- return &ValMappingsLoadSGPROnly[Idx];
+ assert(BankID == AMDGPU::SGPRRegBankID);
+ return &ValMappingsSGPR64OnlyVGPR32[1];
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
index 16d7f2c4f9e59..989937a597fb2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
@@ -43,3 +43,12 @@ AMDGPU::getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
return std::make_tuple(Reg, 0, Def);
}
+
+bool AMDGPU::isLegalVOP3PShuffleMask(ArrayRef<int> Mask) {
+ assert(Mask.size() == 2);
+
+ // If one half is undef, the other is trivially in the same reg.
+ if (Mask[0] == -1 || Mask[1] == -1)
+ return true;
+ return (Mask[0] & 2) == (Mask[1] & 2);
+}
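A quick worked check of the legality rule above: indices 0-1 address the first source register and 2-3 the second, so both lanes must come from the same register unless one is undef (-1). Standalone copy of the rule for illustration:

#include <cstdio>

static bool legalMask(const int Mask[2]) {
  if (Mask[0] == -1 || Mask[1] == -1)
    return true;                        // one half undef: trivially same reg
  return (Mask[0] & 2) == (Mask[1] & 2);
}

int main() {
  int A[2] = {0, 1};   // both from the first source  -> legal
  int B[2] = {0, 3};   // first source + second source -> illegal
  int C[2] = {-1, 2};  // undef + second source        -> legal
  std::printf("%d %d %d\n", legalMask(A), legalMask(B), legalMask(C)); // 1 0 1
}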
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
index 1507ade795479..766750758efc2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
@@ -9,6 +9,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H
+#include "AMDGPUInstrInfo.h"
#include "llvm/CodeGen/Register.h"
#include <tuple>
@@ -23,6 +24,38 @@ namespace AMDGPU {
std::tuple<Register, unsigned, MachineInstr *>
getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg);
+bool isLegalVOP3PShuffleMask(ArrayRef<int> Mask);
+
+/// Return the number of address arguments and the number of gradients for an
+/// image intrinsic.
+inline std::pair<int, int>
+getImageNumVAddr(const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
+ const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode) {
+ const AMDGPU::MIMGDimInfo *DimInfo
+ = AMDGPU::getMIMGDimInfo(ImageDimIntr->Dim);
+
+ int NumGradients = BaseOpcode->Gradients ? DimInfo->NumGradients : 0;
+ int NumCoords = BaseOpcode->Coordinates ? DimInfo->NumCoords : 0;
+ int NumLCM = BaseOpcode->LodOrClampOrMip ? 1 : 0;
+ int NumVAddr = BaseOpcode->NumExtraArgs + NumGradients + NumCoords + NumLCM;
+ return {NumVAddr, NumGradients};
+}
+
+/// Return the index of the dmask operand in a gMIR image intrinsic.
+inline int getDMaskIdx(const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode,
+ int NumDefs) {
+ assert(!BaseOpcode->Atomic);
+ return NumDefs + 1 + (BaseOpcode->Store ? 1 : 0);
+}
+
+/// Return the index of the first address operand in a gMIR image intrinsic.
+inline int getImageVAddrIdxBegin(const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode,
+ int NumDefs) {
+ if (BaseOpcode->Atomic)
+ return NumDefs + 1 + (BaseOpcode->AtomicX2 ? 2 : 1);
+ return getDMaskIdx(BaseOpcode, NumDefs) + 1;
+}
+
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index 511d62943189d..c6f6a3b84e367 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -47,7 +47,7 @@ void MetadataStreamerV2::verify(StringRef HSAMetadataString) const {
errs() << "AMDGPU HSA Metadata Parser Test: ";
HSAMD::Metadata FromHSAMetadataString;
- if (fromString(HSAMetadataString, FromHSAMetadataString)) {
+ if (fromString(std::string(HSAMetadataString), FromHSAMetadataString)) {
errs() << "FAIL\n";
return;
}
@@ -127,38 +127,6 @@ ValueKind MetadataStreamerV2::getValueKind(Type *Ty, StringRef TypeQual,
ValueKind::ByValue);
}
-ValueType MetadataStreamerV2::getValueType(Type *Ty, StringRef TypeName) const {
- switch (Ty->getTypeID()) {
- case Type::IntegerTyID: {
- auto Signed = !TypeName.startswith("u");
- switch (Ty->getIntegerBitWidth()) {
- case 8:
- return Signed ? ValueType::I8 : ValueType::U8;
- case 16:
- return Signed ? ValueType::I16 : ValueType::U16;
- case 32:
- return Signed ? ValueType::I32 : ValueType::U32;
- case 64:
- return Signed ? ValueType::I64 : ValueType::U64;
- default:
- return ValueType::Struct;
- }
- }
- case Type::HalfTyID:
- return ValueType::F16;
- case Type::FloatTyID:
- return ValueType::F32;
- case Type::DoubleTyID:
- return ValueType::F64;
- case Type::PointerTyID:
- return getValueType(Ty->getPointerElementType(), TypeName);
- case Type::VectorTyID:
- return getValueType(Ty->getVectorElementType(), TypeName);
- default:
- return ValueType::Struct;
- }
-}
-
std::string MetadataStreamerV2::getTypeName(Type *Ty, bool Signed) const {
switch (Ty->getTypeID()) {
case Type::IntegerTyID: {
@@ -185,10 +153,10 @@ std::string MetadataStreamerV2::getTypeName(Type *Ty, bool Signed) const {
return "float";
case Type::DoubleTyID:
return "double";
- case Type::VectorTyID: {
- auto VecTy = cast<VectorType>(Ty);
+ case Type::FixedVectorTyID: {
+ auto VecTy = cast<FixedVectorType>(Ty);
auto ElTy = VecTy->getElementType();
- auto NumElements = VecTy->getVectorNumElements();
+ auto NumElements = VecTy->getNumElements();
return (Twine(getTypeName(ElTy, Signed)) + Twine(NumElements)).str();
}
default:
@@ -259,7 +227,8 @@ void MetadataStreamerV2::emitPrintf(const Module &Mod) {
for (auto Op : Node->operands())
if (Op->getNumOperands())
- Printf.push_back(cast<MDString>(Op->getOperand(0))->getString());
+ Printf.push_back(
+ std::string(cast<MDString>(Op->getOperand(0))->getString()));
}
void MetadataStreamerV2::emitKernelLanguage(const Function &Func) {
@@ -345,12 +314,11 @@ void MetadataStreamerV2::emitKernelArg(const Argument &Arg) {
Type *Ty = Arg.getType();
const DataLayout &DL = Func->getParent()->getDataLayout();
- unsigned PointeeAlign = 0;
+ MaybeAlign PointeeAlign;
if (auto PtrTy = dyn_cast<PointerType>(Ty)) {
if (PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
- PointeeAlign = Arg.getParamAlignment();
- if (PointeeAlign == 0)
- PointeeAlign = DL.getABITypeAlignment(PtrTy->getElementType());
+ PointeeAlign = DL.getValueOrABITypeAlignment(Arg.getParamAlign(),
+ PtrTy->getElementType());
}
}
@@ -360,20 +328,19 @@ void MetadataStreamerV2::emitKernelArg(const Argument &Arg) {
void MetadataStreamerV2::emitKernelArg(const DataLayout &DL, Type *Ty,
ValueKind ValueKind,
- unsigned PointeeAlign, StringRef Name,
+ MaybeAlign PointeeAlign, StringRef Name,
StringRef TypeName,
StringRef BaseTypeName,
StringRef AccQual, StringRef TypeQual) {
HSAMetadata.mKernels.back().mArgs.push_back(Kernel::Arg::Metadata());
auto &Arg = HSAMetadata.mKernels.back().mArgs.back();
- Arg.mName = Name;
- Arg.mTypeName = TypeName;
+ Arg.mName = std::string(Name);
+ Arg.mTypeName = std::string(TypeName);
Arg.mSize = DL.getTypeAllocSize(Ty);
- Arg.mAlign = DL.getABITypeAlignment(Ty);
+ Arg.mAlign = DL.getABITypeAlign(Ty).value();
Arg.mValueKind = ValueKind;
- Arg.mValueType = getValueType(Ty, BaseTypeName);
- Arg.mPointeeAlign = PointeeAlign;
+ Arg.mPointeeAlign = PointeeAlign ? PointeeAlign->value() : 0;
if (auto PtrTy = dyn_cast<PointerType>(Ty))
Arg.mAddrSpaceQual = getAddressSpaceQualifier(PtrTy->getAddressSpace());
@@ -479,7 +446,7 @@ void MetadataStreamerV2::emitKernel(const MachineFunction &MF,
HSAMetadata.mKernels.push_back(Kernel::Metadata());
auto &Kernel = HSAMetadata.mKernels.back();
- Kernel.mName = Func.getName();
+ Kernel.mName = std::string(Func.getName());
Kernel.mSymbolName = (Twine(Func.getName()) + Twine("@kd")).str();
emitKernelLanguage(Func);
emitKernelAttrs(Func);
@@ -573,38 +540,6 @@ StringRef MetadataStreamerV3::getValueKind(Type *Ty, StringRef TypeQual,
: "by_value");
}
-StringRef MetadataStreamerV3::getValueType(Type *Ty, StringRef TypeName) const {
- switch (Ty->getTypeID()) {
- case Type::IntegerTyID: {
- auto Signed = !TypeName.startswith("u");
- switch (Ty->getIntegerBitWidth()) {
- case 8:
- return Signed ? "i8" : "u8";
- case 16:
- return Signed ? "i16" : "u16";
- case 32:
- return Signed ? "i32" : "u32";
- case 64:
- return Signed ? "i64" : "u64";
- default:
- return "struct";
- }
- }
- case Type::HalfTyID:
- return "f16";
- case Type::FloatTyID:
- return "f32";
- case Type::DoubleTyID:
- return "f64";
- case Type::PointerTyID:
- return getValueType(Ty->getPointerElementType(), TypeName);
- case Type::VectorTyID:
- return getValueType(Ty->getVectorElementType(), TypeName);
- default:
- return "struct";
- }
-}
-
std::string MetadataStreamerV3::getTypeName(Type *Ty, bool Signed) const {
switch (Ty->getTypeID()) {
case Type::IntegerTyID: {
@@ -631,10 +566,10 @@ std::string MetadataStreamerV3::getTypeName(Type *Ty, bool Signed) const {
return "float";
case Type::DoubleTyID:
return "double";
- case Type::VectorTyID: {
- auto VecTy = cast<VectorType>(Ty);
+ case Type::FixedVectorTyID: {
+ auto VecTy = cast<FixedVectorType>(Ty);
auto ElTy = VecTy->getElementType();
- auto NumElements = VecTy->getVectorNumElements();
+ auto NumElements = VecTy->getNumElements();
return (Twine(getTypeName(ElTy, Signed)) + Twine(NumElements)).str();
}
default:
@@ -767,12 +702,11 @@ void MetadataStreamerV3::emitKernelArg(const Argument &Arg, unsigned &Offset,
Type *Ty = Arg.getType();
const DataLayout &DL = Func->getParent()->getDataLayout();
- unsigned PointeeAlign = 0;
+ MaybeAlign PointeeAlign;
if (auto PtrTy = dyn_cast<PointerType>(Ty)) {
if (PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
- PointeeAlign = Arg.getParamAlignment();
- if (PointeeAlign == 0)
- PointeeAlign = DL.getABITypeAlignment(PtrTy->getElementType());
+ PointeeAlign = DL.getValueOrABITypeAlignment(Arg.getParamAlign(),
+ PtrTy->getElementType());
}
}
@@ -785,7 +719,7 @@ void MetadataStreamerV3::emitKernelArg(const Argument &Arg, unsigned &Offset,
void MetadataStreamerV3::emitKernelArg(const DataLayout &DL, Type *Ty,
StringRef ValueKind, unsigned &Offset,
msgpack::ArrayDocNode Args,
- unsigned PointeeAlign, StringRef Name,
+ MaybeAlign PointeeAlign, StringRef Name,
StringRef TypeName,
StringRef BaseTypeName,
StringRef AccQual, StringRef TypeQual) {
@@ -796,16 +730,14 @@ void MetadataStreamerV3::emitKernelArg(const DataLayout &DL, Type *Ty,
if (!TypeName.empty())
Arg[".type_name"] = Arg.getDocument()->getNode(TypeName, /*Copy=*/true);
auto Size = DL.getTypeAllocSize(Ty);
- auto Align = DL.getABITypeAlignment(Ty);
+ Align Alignment = DL.getABITypeAlign(Ty);
Arg[".size"] = Arg.getDocument()->getNode(Size);
- Offset = alignTo(Offset, Align);
+ Offset = alignTo(Offset, Alignment);
Arg[".offset"] = Arg.getDocument()->getNode(Offset);
Offset += Size;
Arg[".value_kind"] = Arg.getDocument()->getNode(ValueKind, /*Copy=*/true);
- Arg[".value_type"] =
- Arg.getDocument()->getNode(getValueType(Ty, BaseTypeName), /*Copy=*/true);
if (PointeeAlign)
- Arg[".pointee_align"] = Arg.getDocument()->getNode(PointeeAlign);
+ Arg[".pointee_align"] = Arg.getDocument()->getNode(PointeeAlign->value());
if (auto PtrTy = dyn_cast<PointerType>(Ty))
if (auto Qualifier = getAddressSpaceQualifier(PtrTy->getAddressSpace()))
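The .offset bookkeeping above lays kernel arguments out by rounding the running offset up to each argument's ABI alignment before recording it; a standalone model of that packing (values are illustrative):

#include <cstdint>
#include <cstdio>

static uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;   // round up to a multiple
}

int main() {
  struct { uint64_t Size, Align; } Args[] = {{4, 4}, {8, 8}, {2, 2}};
  uint64_t Offset = 0;
  for (auto &A : Args) {
    Offset = alignTo(Offset, A.Align);          // arg offsets: 0, 8, 16
    std::printf("offset=%llu size=%llu\n",
                (unsigned long long)Offset, (unsigned long long)A.Size);
    Offset += A.Size;
  }
}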
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
index 80ac8ca67bcd1..9534fffd228d3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
@@ -20,6 +20,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/BinaryFormat/MsgPackDocument.h"
#include "llvm/Support/AMDGPUMetadata.h"
+#include "llvm/Support/Alignment.h"
namespace llvm {
@@ -27,6 +28,7 @@ class AMDGPUTargetStreamer;
class Argument;
class DataLayout;
class Function;
+class MachineFunction;
class MDNode;
class Module;
struct SIProgramInfo;
@@ -65,8 +67,6 @@ private:
StringRef getValueKind(Type *Ty, StringRef TypeQual,
StringRef BaseTypeName) const;
- StringRef getValueType(Type *Ty, StringRef TypeName) const;
-
std::string getTypeName(Type *Ty, bool Signed) const;
msgpack::ArrayDocNode getWorkGroupDimensions(MDNode *Node) const;
@@ -89,7 +89,7 @@ private:
void emitKernelArg(const DataLayout &DL, Type *Ty, StringRef ValueKind,
unsigned &Offset, msgpack::ArrayDocNode Args,
- unsigned PointeeAlign = 0, StringRef Name = "",
+ MaybeAlign PointeeAlign = None, StringRef Name = "",
StringRef TypeName = "", StringRef BaseTypeName = "",
StringRef AccQual = "", StringRef TypeQual = "");
@@ -133,8 +133,6 @@ private:
ValueKind getValueKind(Type *Ty, StringRef TypeQual,
StringRef BaseTypeName) const;
- ValueType getValueType(Type *Ty, StringRef TypeName) const;
-
std::string getTypeName(Type *Ty, bool Signed) const;
std::vector<uint32_t> getWorkGroupDimensions(MDNode *Node) const;
@@ -159,10 +157,9 @@ private:
void emitKernelArg(const Argument &Arg);
void emitKernelArg(const DataLayout &DL, Type *Ty, ValueKind ValueKind,
- unsigned PointeeAlign = 0,
- StringRef Name = "", StringRef TypeName = "",
- StringRef BaseTypeName = "", StringRef AccQual = "",
- StringRef TypeQual = "");
+ MaybeAlign PointeeAlign = None, StringRef Name = "",
+ StringRef TypeName = "", StringRef BaseTypeName = "",
+ StringRef AccQual = "", StringRef TypeQual = "");
void emitHiddenKernelArgs(const Function &Func);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 2b6308dc1549e..aaf448346b533 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -16,7 +16,6 @@
#include "AMDGPUISelLowering.h" // For AMDGPUISD
#include "AMDGPUInstrInfo.h"
#include "AMDGPUPerfHintAnalysis.h"
-#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
@@ -29,6 +28,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/ISDOpcodes.h"
@@ -252,7 +252,6 @@ private:
bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;
bool SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, SDValue &SrcMods) const;
- bool SelectVOP3Mods_f32(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &SrcMods) const;
bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3NoMods(SDValue In, SDValue &Src) const;
@@ -265,16 +264,10 @@ private:
SDValue &Clamp, SDValue &Omod) const;
bool SelectVOP3PMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
- bool SelectVOP3PMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
- SDValue &Clamp) const;
bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const;
- bool SelectVOP3OpSel0(SDValue In, SDValue &Src, SDValue &SrcMods,
- SDValue &Clamp) const;
bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
- bool SelectVOP3OpSelMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
- SDValue &Clamp) const;
bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, unsigned &Mods) const;
bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
@@ -286,7 +279,6 @@ private:
void SelectAddcSubb(SDNode *N);
void SelectUADDO_USUBO(SDNode *N);
void SelectDIV_SCALE(SDNode *N);
- void SelectDIV_FMAS(SDNode *N);
void SelectMAD_64_32(SDNode *N);
void SelectFMA_W_CHAIN(SDNode *N);
void SelectFMUL_W_CHAIN(SDNode *N);
@@ -301,6 +293,7 @@ private:
void SelectATOMIC_CMP_SWAP(SDNode *N);
void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
void SelectDS_GWS(SDNode *N, unsigned IntrID);
+ void SelectInterpP1F16(SDNode *N);
void SelectINTRINSIC_W_CHAIN(SDNode *N);
void SelectINTRINSIC_WO_CHAIN(SDNode *N);
void SelectINTRINSIC_VOID(SDNode *N);
@@ -409,7 +402,7 @@ bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
}
#endif
Subtarget = &MF.getSubtarget<GCNSubtarget>();
- Mode = AMDGPU::SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
+ Mode = AMDGPU::SIModeRegisterDefaults(MF.getFunction());
return SelectionDAGISel::runOnMachineFunction(MF);
}
@@ -655,29 +648,6 @@ MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
}
-static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) {
- switch (NumVectorElts) {
- case 1:
- return AMDGPU::SReg_32RegClassID;
- case 2:
- return AMDGPU::SReg_64RegClassID;
- case 3:
- return AMDGPU::SGPR_96RegClassID;
- case 4:
- return AMDGPU::SGPR_128RegClassID;
- case 5:
- return AMDGPU::SGPR_160RegClassID;
- case 8:
- return AMDGPU::SReg_256RegClassID;
- case 16:
- return AMDGPU::SReg_512RegClassID;
- case 32:
- return AMDGPU::SReg_1024RegClassID;
- }
-
- llvm_unreachable("invalid vector size");
-}
-
void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
EVT VT = N->getValueType(0);
unsigned NumVectorElts = VT.getVectorNumElements();
@@ -698,6 +668,8 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
// 1 = Vector Register Class
SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
+ bool IsGCN = CurDAG->getSubtarget().getTargetTriple().getArch() ==
+ Triple::amdgcn;
RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
bool IsRegSeq = true;
unsigned NOps = N->getNumOperands();
@@ -707,7 +679,8 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
IsRegSeq = false;
break;
}
- unsigned Sub = AMDGPURegisterInfo::getSubRegFromChannel(i);
+ unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
+ : R600RegisterInfo::getSubRegFromChannel(i);
RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
}
@@ -717,7 +690,8 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
DL, EltVT);
for (unsigned i = NOps; i < NumVectorElts; ++i) {
- unsigned Sub = AMDGPURegisterInfo::getSubRegFromChannel(i);
+ unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
+ : R600RegisterInfo::getSubRegFromChannel(i);
RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
RegSeqArgs[1 + (2 * i) + 1] =
CurDAG->getTargetConstant(Sub, DL, MVT::i32);
@@ -742,7 +716,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
(Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC ||
Opc == ISD::ATOMIC_LOAD_FADD ||
Opc == AMDGPUISD::ATOMIC_LOAD_FMIN ||
- Opc == AMDGPUISD::ATOMIC_LOAD_FMAX)) {
+ Opc == AMDGPUISD::ATOMIC_LOAD_FMAX ||
+ Opc == AMDGPUISD::ATOMIC_LOAD_CSUB)) {
N = glueCopyToM0LDSInit(N);
SelectCode(N);
return;
@@ -801,7 +776,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
}
assert(VT.getVectorElementType().bitsEq(MVT::i32));
- unsigned RegClassID = selectSGPRVectorRegClassID(NumVectorElts);
+ unsigned RegClassID =
+ SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32)->getID();
SelectBuildVector(N, RegClassID);
return;
}
@@ -874,10 +850,6 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
SelectDIV_SCALE(N);
return;
}
- case AMDGPUISD::DIV_FMAS: {
- SelectDIV_FMAS(N);
- return;
- }
case AMDGPUISD::MAD_I64_I32:
case AMDGPUISD::MAD_U64_U32: {
SelectMAD_64_32(N);
@@ -1020,8 +992,14 @@ void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);
- unsigned Opc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
- unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
+ static const unsigned OpcMap[2][2][2] = {
+ {{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32},
+ {AMDGPU::V_SUB_I32_e32, AMDGPU::V_ADD_I32_e32}},
+ {{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32},
+ {AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}};
+
+ unsigned Opc = OpcMap[0][N->isDivergent()][IsAdd];
+ unsigned CarryOpc = OpcMap[1][N->isDivergent()][IsAdd];
SDNode *AddLo;
if (!ConsumeCarry) {
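// Note on the opcode table above (editorial sketch, not part of the patch):
// the first index picks the plain low-half opcode vs. the carry-consuming
// high-half opcode, the second picks the VALU form for divergent nodes and
// the SALU form for uniform ones, and the third selects sub or add.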
@@ -1063,24 +1041,51 @@ void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
SDValue RHS = N->getOperand(1);
SDValue CI = N->getOperand(2);
- unsigned Opc = N->getOpcode() == ISD::ADDCARRY ? AMDGPU::V_ADDC_U32_e64
- : AMDGPU::V_SUBB_U32_e64;
- CurDAG->SelectNodeTo(
- N, Opc, N->getVTList(),
- {LHS, RHS, CI, CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
+ if (N->isDivergent()) {
+ unsigned Opc = N->getOpcode() == ISD::ADDCARRY ? AMDGPU::V_ADDC_U32_e64
+ : AMDGPU::V_SUBB_U32_e64;
+ CurDAG->SelectNodeTo(
+ N, Opc, N->getVTList(),
+ {LHS, RHS, CI,
+ CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
+ } else {
+ unsigned Opc = N->getOpcode() == ISD::ADDCARRY ? AMDGPU::S_ADD_CO_PSEUDO
+ : AMDGPU::S_SUB_CO_PSEUDO;
+ CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI});
+ }
}
void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
// The names of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
// carry out despite the _i32 name. These were renamed in VI to _U32.
// FIXME: We should probably rename the opcodes here.
- unsigned Opc = N->getOpcode() == ISD::UADDO ?
- AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
+ bool IsAdd = N->getOpcode() == ISD::UADDO;
+ bool IsVALU = N->isDivergent();
+
+ for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end(); UI != E;
+ ++UI)
+ if (UI.getUse().getResNo() == 1) {
+ if ((IsAdd && (UI->getOpcode() != ISD::ADDCARRY)) ||
+ (!IsAdd && (UI->getOpcode() != ISD::SUBCARRY))) {
+ IsVALU = true;
+ break;
+ }
+ }
+
+ if (IsVALU) {
+ unsigned Opc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
+
+ CurDAG->SelectNodeTo(
+ N, Opc, N->getVTList(),
+ {N->getOperand(0), N->getOperand(1),
+ CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
+ } else {
+ unsigned Opc = N->getOpcode() == ISD::UADDO ? AMDGPU::S_UADDO_PSEUDO
+ : AMDGPU::S_USUBO_PSEUDO;
- CurDAG->SelectNodeTo(
- N, Opc, N->getVTList(),
- {N->getOperand(0), N->getOperand(1),
- CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
+ CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
+ {N->getOperand(0), N->getOperand(1)});
+ }
}
void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
@@ -1125,35 +1130,6 @@ void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}
-void AMDGPUDAGToDAGISel::SelectDIV_FMAS(SDNode *N) {
- const GCNSubtarget *ST = static_cast<const GCNSubtarget *>(Subtarget);
- const SIRegisterInfo *TRI = ST->getRegisterInfo();
-
- SDLoc SL(N);
- EVT VT = N->getValueType(0);
-
- assert(VT == MVT::f32 || VT == MVT::f64);
-
- unsigned Opc
- = (VT == MVT::f64) ? AMDGPU::V_DIV_FMAS_F64 : AMDGPU::V_DIV_FMAS_F32;
-
- SDValue CarryIn = N->getOperand(3);
- // V_DIV_FMAS implicitly reads VCC.
- SDValue VCC = CurDAG->getCopyToReg(CurDAG->getEntryNode(), SL,
- TRI->getVCC(), CarryIn, SDValue());
-
- SDValue Ops[10];
-
- SelectVOP3Mods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
- SelectVOP3Mods(N->getOperand(1), Ops[3], Ops[2]);
- SelectVOP3Mods(N->getOperand(2), Ops[5], Ops[4]);
-
- Ops[8] = VCC;
- Ops[9] = VCC.getValue(1);
-
- CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
-}
-
// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
@@ -1343,6 +1319,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
SDValue &TFE, SDValue &DLC,
SDValue &SWZ) const {
// Subtarget prefers to use flat instruction
+ // FIXME: This should be a pattern predicate and not reach here
if (Subtarget->useFlatForGlobal())
return false;
@@ -1438,6 +1415,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
SDValue Ptr, Offen, Idxen, Addr64;
// addr64 bit was removed for volcanic islands.
+ // FIXME: This should be a pattern predicate and not reach here
if (!Subtarget->hasAddr64())
return false;
@@ -1475,6 +1453,7 @@ static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
}
std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
+ SDLoc DL(N);
const MachineFunction &MF = CurDAG->getMachineFunction();
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
@@ -1489,9 +1468,8 @@ std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const
}
// If we don't know this private access is a local stack object, it needs to
- // be relative to the entry point's scratch wave offset register.
- return std::make_pair(N, CurDAG->getRegister(Info->getScratchWaveOffsetReg(),
- MVT::i32));
+ // be relative to the entry point's scratch wave offset.
+ return std::make_pair(N, CurDAG->getTargetConstant(0, DL, MVT::i32));
}
bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
@@ -1506,22 +1484,26 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
- unsigned Imm = CAddr->getZExtValue();
-
- SDValue HighBits = CurDAG->getTargetConstant(Imm & ~4095, DL, MVT::i32);
- MachineSDNode *MovHighBits = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
- DL, MVT::i32, HighBits);
- VAddr = SDValue(MovHighBits, 0);
-
- // In a call sequence, stores to the argument stack area are relative to the
- // stack pointer.
- const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo();
- unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ?
- Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg();
-
- SOffset = CurDAG->getRegister(SOffsetReg, MVT::i32);
- ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16);
- return true;
+ int64_t Imm = CAddr->getSExtValue();
+ const int64_t NullPtr =
+ AMDGPUTargetMachine::getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS);
+ // Don't fold null pointer.
+ if (Imm != NullPtr) {
+ SDValue HighBits = CurDAG->getTargetConstant(Imm & ~4095, DL, MVT::i32);
+ MachineSDNode *MovHighBits = CurDAG->getMachineNode(
+ AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
+ VAddr = SDValue(MovHighBits, 0);
+
+ // In a call sequence, stores to the argument stack area are relative to the
+ // stack pointer.
+ const MachinePointerInfo &PtrInfo
+ = cast<MemSDNode>(Parent)->getPointerInfo();
+ SOffset = isStackPtrRelative(PtrInfo)
+ ? CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32)
+ : CurDAG->getTargetConstant(0, DL, MVT::i32);
+ ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16);
+ return true;
+ }
}
if (CurDAG->isBaseWithConstantOffset(Addr)) {
@@ -1577,12 +1559,12 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo();
- unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ?
- Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg();
// FIXME: Get from MachinePointerInfo? We should only be using the frame
// offset if we know this is in a call sequence.
- SOffset = CurDAG->getRegister(SOffsetReg, MVT::i32);
+ SOffset = isStackPtrRelative(PtrInfo)
+ ? CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32)
+ : CurDAG->getTargetConstant(0, DL, MVT::i32);
Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
return true;
@@ -1646,6 +1628,37 @@ static MemSDNode* findMemSDNode(SDNode *N) {
llvm_unreachable("cannot find MemSDNode in the pattern!");
}
+static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr,
+ SDValue &N0, SDValue &N1) {
+ if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
+ Addr.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
+ // As we split 64-bit `or` earlier, it's a complicated pattern to match, i.e.
+ // (i64 (bitcast (v2i32 (build_vector
+ // (or (extract_vector_elt V, 0), OFFSET),
+ // (extract_vector_elt V, 1)))))
+ SDValue Lo = Addr.getOperand(0).getOperand(0);
+ if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
+ SDValue BaseLo = Lo.getOperand(0);
+ SDValue BaseHi = Addr.getOperand(0).getOperand(1);
+ // Check that split base (Lo and Hi) are extracted from the same one.
+ if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ BaseHi.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
+ // Lo is statically extracted from index 0.
+ isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
+ BaseLo.getConstantOperandVal(1) == 0 &&
+ // Hi is statically extracted from index 1.
+ isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
+ BaseHi.getConstantOperandVal(1) == 1) {
+ N0 = BaseLo.getOperand(0).getOperand(0);
+ N1 = Lo.getOperand(1);
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
template <bool IsSigned>
bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N,
SDValue Addr,
@@ -1656,84 +1669,91 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N,
if (Subtarget->hasFlatInstOffsets() &&
(!Subtarget->hasFlatSegmentOffsetBug() ||
- findMemSDNode(N)->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS) &&
- CurDAG->isBaseWithConstantOffset(Addr)) {
- SDValue N0 = Addr.getOperand(0);
- SDValue N1 = Addr.getOperand(1);
- uint64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
-
- const SIInstrInfo *TII = Subtarget->getInstrInfo();
- unsigned AS = findMemSDNode(N)->getAddressSpace();
- if (TII->isLegalFLATOffset(COffsetVal, AS, IsSigned)) {
- Addr = N0;
- OffsetVal = COffsetVal;
- } else {
- // If the offset doesn't fit, put the low bits into the offset field and
- // add the rest.
-
- SDLoc DL(N);
- uint64_t ImmField;
- const unsigned NumBits = TII->getNumFlatOffsetBits(AS, IsSigned);
- if (IsSigned) {
- ImmField = SignExtend64(COffsetVal, NumBits);
-
- // Don't use a negative offset field if the base offset is positive.
- // Since the scheduler currently relies on the offset field, doing so
- // could result in strange scheduling decisions.
-
- // TODO: Should we not do this in the opposite direction as well?
- if (static_cast<int64_t>(COffsetVal) > 0) {
- if (static_cast<int64_t>(ImmField) < 0) {
- const uint64_t OffsetMask = maskTrailingOnes<uint64_t>(NumBits - 1);
- ImmField = COffsetVal & OffsetMask;
+ findMemSDNode(N)->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS)) {
+ SDValue N0, N1;
+ if (CurDAG->isBaseWithConstantOffset(Addr)) {
+ N0 = Addr.getOperand(0);
+ N1 = Addr.getOperand(1);
+ } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
+ assert(N0 && N1 && isa<ConstantSDNode>(N1));
+ }
+ if (N0 && N1) {
+ uint64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
+
+ const SIInstrInfo *TII = Subtarget->getInstrInfo();
+ unsigned AS = findMemSDNode(N)->getAddressSpace();
+ if (TII->isLegalFLATOffset(COffsetVal, AS, IsSigned)) {
+ Addr = N0;
+ OffsetVal = COffsetVal;
+ } else {
+ // If the offset doesn't fit, put the low bits into the offset field and
+ // add the rest.
+
+ SDLoc DL(N);
+ uint64_t ImmField;
+ const unsigned NumBits = TII->getNumFlatOffsetBits(AS, IsSigned);
+ if (IsSigned) {
+ ImmField = SignExtend64(COffsetVal, NumBits);
+
+ // Don't use a negative offset field if the base offset is positive.
+ // Since the scheduler currently relies on the offset field, doing so
+ // could result in strange scheduling decisions.
+
+ // TODO: Should we not do this in the opposite direction as well?
+ if (static_cast<int64_t>(COffsetVal) > 0) {
+ if (static_cast<int64_t>(ImmField) < 0) {
+ const uint64_t OffsetMask =
+ maskTrailingOnes<uint64_t>(NumBits - 1);
+ ImmField = COffsetVal & OffsetMask;
+ }
}
+ } else {
+ // TODO: Should we do this for a negative offset?
+ const uint64_t OffsetMask = maskTrailingOnes<uint64_t>(NumBits);
+ ImmField = COffsetVal & OffsetMask;
}
- } else {
- // TODO: Should we do this for a negative offset?
- const uint64_t OffsetMask = maskTrailingOnes<uint64_t>(NumBits);
- ImmField = COffsetVal & OffsetMask;
- }
- uint64_t RemainderOffset = COffsetVal - ImmField;
+ uint64_t RemainderOffset = COffsetVal - ImmField;
- assert(TII->isLegalFLATOffset(ImmField, AS, IsSigned));
- assert(RemainderOffset + ImmField == COffsetVal);
+ assert(TII->isLegalFLATOffset(ImmField, AS, IsSigned));
+ assert(RemainderOffset + ImmField == COffsetVal);
- OffsetVal = ImmField;
+ OffsetVal = ImmField;
- // TODO: Should this try to use a scalar add pseudo if the base address is
- // uniform and saddr is usable?
- SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
- SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
+ // TODO: Should this try to use a scalar add pseudo if the base address
+ // is uniform and saddr is usable?
+ SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
+ SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
- SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
- DL, MVT::i32, N0, Sub0);
- SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
- DL, MVT::i32, N0, Sub1);
+ SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
+ MVT::i32, N0, Sub0);
+ SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
+ MVT::i32, N0, Sub1);
- SDValue AddOffsetLo
- = getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
- SDValue AddOffsetHi
- = getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
+ SDValue AddOffsetLo =
+ getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
+ SDValue AddOffsetHi =
+ getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
- SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
- SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
+ SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
+ SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
- SDNode *Add = CurDAG->getMachineNode(
- AMDGPU::V_ADD_I32_e64, DL, VTs,
- {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
+ SDNode *Add =
+ CurDAG->getMachineNode(AMDGPU::V_ADD_I32_e64, DL, VTs,
+ {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
- SDNode *Addc = CurDAG->getMachineNode(
- AMDGPU::V_ADDC_U32_e64, DL, VTs,
- {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
+ SDNode *Addc = CurDAG->getMachineNode(
+ AMDGPU::V_ADDC_U32_e64, DL, VTs,
+ {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
- SDValue RegSequenceArgs[] = {
- CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
- SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1
- };
+ SDValue RegSequenceArgs[] = {
+ CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
+ SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
- Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
- MVT::i64, RegSequenceArgs), 0);
+ Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
+ MVT::i64, RegSequenceArgs),
+ 0);
+ }
}
}
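// Editorial sketch (not part of the patch) of the unsigned offset split done
// above: the low NumBits of the constant go into the flat instruction's
// immediate field and the remainder is added back onto the base with the
// v_add_i32/v_addc_u32 pair. Uses maskTrailingOnes from
// llvm/Support/MathExtras.h.
static void splitFlatOffsetSketch(uint64_t COffsetVal, unsigned NumBits,
                                  uint64_t &ImmField,
                                  uint64_t &RemainderOffset) {
  const uint64_t OffsetMask = maskTrailingOnes<uint64_t>(NumBits);
  ImmField = COffsetVal & OffsetMask;      // encoded in the offset field
  RemainderOffset = COffsetVal - ImmField; // materialized and added to the base
}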
@@ -1761,35 +1781,52 @@ bool AMDGPUDAGToDAGISel::SelectFlatAtomicSigned(SDNode *N,
bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
SDValue &Offset, bool &Imm) const {
-
- // FIXME: Handle non-constant offsets.
ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
- if (!C)
+ if (!C) {
+ if (ByteOffsetNode.getValueType().isScalarInteger() &&
+ ByteOffsetNode.getValueType().getSizeInBits() == 32) {
+ Offset = ByteOffsetNode;
+ Imm = false;
+ return true;
+ }
+ if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
+ if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
+ Offset = ByteOffsetNode.getOperand(0);
+ Imm = false;
+ return true;
+ }
+ }
return false;
+ }
SDLoc SL(ByteOffsetNode);
- GCNSubtarget::Generation Gen = Subtarget->getGeneration();
+ // GFX9 and GFX10 have signed byte immediate offsets.
int64_t ByteOffset = C->getSExtValue();
- int64_t EncodedOffset = AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset);
-
- if (AMDGPU::isLegalSMRDImmOffset(*Subtarget, ByteOffset)) {
- Offset = CurDAG->getTargetConstant(EncodedOffset, SL, MVT::i32);
+ Optional<int64_t> EncodedOffset =
+ AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset, false);
+ if (EncodedOffset) {
+ Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
Imm = true;
return true;
}
- if (!isUInt<32>(EncodedOffset) || !isUInt<32>(ByteOffset))
+ // SGPR and literal offsets are unsigned.
+ if (ByteOffset < 0)
return false;
- if (Gen == AMDGPUSubtarget::SEA_ISLANDS && isUInt<32>(EncodedOffset)) {
- // 32-bit Immediates are supported on Sea Islands.
- Offset = CurDAG->getTargetConstant(EncodedOffset, SL, MVT::i32);
- } else {
- SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
- Offset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32,
- C32Bit), 0);
+ EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset);
+ if (EncodedOffset) {
+ Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
+ return true;
}
- Imm = false;
+
+ if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset))
+ return false;
+
+ SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
+ Offset = SDValue(
+ CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);
+
return true;
}
@@ -1825,14 +1862,21 @@ bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
// A 32-bit (address + offset) should not cause unsigned 32-bit integer
// wraparound, because s_load instructions perform the addition in 64 bits.
if ((Addr.getValueType() != MVT::i32 ||
- Addr->getFlags().hasNoUnsignedWrap()) &&
- CurDAG->isBaseWithConstantOffset(Addr)) {
- SDValue N0 = Addr.getOperand(0);
- SDValue N1 = Addr.getOperand(1);
-
- if (SelectSMRDOffset(N1, Offset, Imm)) {
- SBase = Expand32BitAddress(N0);
- return true;
+ Addr->getFlags().hasNoUnsignedWrap())) {
+ SDValue N0, N1;
+ // Extract the base and offset if possible.
+ if (CurDAG->isBaseWithConstantOffset(Addr) ||
+ Addr.getOpcode() == ISD::ADD) {
+ N0 = Addr.getOperand(0);
+ N1 = Addr.getOperand(1);
+ } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
+ assert(N0 && N1 && isa<ConstantSDNode>(N1));
+ }
+ if (N0 && N1) {
+ if (SelectSMRDOffset(N1, Offset, Imm)) {
+ SBase = Expand32BitAddress(N0);
+ return true;
+ }
}
}
SBase = Expand32BitAddress(Addr);
@@ -1843,17 +1887,16 @@ bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
SDValue &Offset) const {
- bool Imm;
+ bool Imm = false;
return SelectSMRD(Addr, SBase, Offset, Imm) && Imm;
}
bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
SDValue &Offset) const {
- if (Subtarget->getGeneration() != AMDGPUSubtarget::SEA_ISLANDS)
- return false;
+ assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
- bool Imm;
+ bool Imm = false;
if (!SelectSMRD(Addr, SBase, Offset, Imm))
return false;
@@ -1862,27 +1905,38 @@ bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
SDValue &Offset) const {
- bool Imm;
+ bool Imm = false;
return SelectSMRD(Addr, SBase, Offset, Imm) && !Imm &&
!isa<ConstantSDNode>(Offset);
}
bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue Addr,
SDValue &Offset) const {
- bool Imm;
- return SelectSMRDOffset(Addr, Offset, Imm) && Imm;
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr)) {
+ // The immediate offset for S_BUFFER instructions is unsigned.
+ if (auto Imm =
+ AMDGPU::getSMRDEncodedOffset(*Subtarget, C->getZExtValue(), true)) {
+ Offset = CurDAG->getTargetConstant(*Imm, SDLoc(Addr), MVT::i32);
+ return true;
+ }
+ }
+
+ return false;
}
bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue Addr,
SDValue &Offset) const {
- if (Subtarget->getGeneration() != AMDGPUSubtarget::SEA_ISLANDS)
- return false;
+ assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
- bool Imm;
- if (!SelectSMRDOffset(Addr, Offset, Imm))
- return false;
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr)) {
+ if (auto Imm = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget,
+ C->getZExtValue())) {
+ Offset = CurDAG->getTargetConstant(*Imm, SDLoc(Addr), MVT::i32);
+ return true;
+ }
+ }
- return !Imm && isa<ConstantSDNode>(Offset);
+ return false;
}
bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
@@ -1898,7 +1952,9 @@ bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
// (add n0, c0)
// Don't peel off the offset (c0) if doing so could possibly lead
// the base (n0) to be negative.
- if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0)) {
+ // (or n0, |c0|) can never change a sign given isBaseWithConstantOffset.
+ if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0) ||
+ (Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
Base = N0;
Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
return true;
@@ -2066,7 +2122,7 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
unsigned BrOp = UseSCCBr ? AMDGPU::S_CBRANCH_SCC1 : AMDGPU::S_CBRANCH_VCCNZ;
- unsigned CondReg = UseSCCBr ? (unsigned)AMDGPU::SCC : TRI->getVCC();
+ Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
SDLoc SL(N);
if (!UseSCCBr) {
@@ -2121,7 +2177,7 @@ void AMDGPUDAGToDAGISel::SelectFMAD_FMA(SDNode *N) {
bool Sel1 = SelectVOP3PMadMixModsImpl(Src1, Src1, Src1Mods);
bool Sel2 = SelectVOP3PMadMixModsImpl(Src2, Src2, Src2Mods);
- assert((IsFMA || !Mode.FP32Denormals) &&
+ assert((IsFMA || !Mode.allFP32Denormals()) &&
"fmad selected with denormals enabled");
// TODO: We can select this with f32 denormals enabled if all the sources are
// converted from f16 (in which case fmad isn't legal).
@@ -2338,6 +2394,64 @@ void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
}
+void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
+ if (Subtarget->getLDSBankCount() != 16) {
+ // This is a single instruction with a pattern.
+ SelectCode(N);
+ return;
+ }
+
+ SDLoc DL(N);
+
+ // This requires 2 instructions. It is possible to write a pattern to support
+ // this, but the generated isel emitter doesn't correctly deal with multiple
+ // output instructions using the same physical register input. The copy to m0
+ // is incorrectly placed before the second instruction.
+ //
+ // TODO: Match source modifiers.
+ //
+ // def : Pat <
+ // (int_amdgcn_interp_p1_f16
+ // (VOP3Mods f32:$src0, i32:$src0_modifiers),
+ // (i32 timm:$attrchan), (i32 timm:$attr),
+ // (i1 timm:$high), M0),
+ // (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
+ // timm:$attrchan, 0,
+ // (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
+ // let Predicates = [has16BankLDS];
+ // }
+
+ // 16 bank LDS
+ SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0,
+ N->getOperand(5), SDValue());
+
+ SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other);
+
+ SDNode *InterpMov =
+ CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, {
+ CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
+ N->getOperand(3), // Attr
+ N->getOperand(2), // Attrchan
+ ToM0.getValue(1) // In glue
+ });
+
+ SDNode *InterpP1LV =
+ CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, {
+ CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
+ N->getOperand(1), // Src0
+ N->getOperand(3), // Attr
+ N->getOperand(2), // Attrchan
+ CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
+ SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
+ N->getOperand(4), // high
+ CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
+ CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
+ SDValue(InterpMov, 1)
+ });
+
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0));
+}
+
void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
switch (IntrID) {
@@ -2366,6 +2480,9 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
case Intrinsic::amdgcn_wwm:
Opcode = AMDGPU::WWM;
break;
+ case Intrinsic::amdgcn_interp_p1_f16:
+ SelectInterpP1F16(N);
+ return;
default:
SelectCode(N);
return;
@@ -2428,15 +2545,6 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods_NNaN(SDValue In, SDValue &Src,
return isNoNanSrc(Src);
}
-bool AMDGPUDAGToDAGISel::SelectVOP3Mods_f32(SDValue In, SDValue &Src,
- SDValue &SrcMods) const {
- if (In.getValueType() == MVT::f32)
- return SelectVOP3Mods(In, Src, SrcMods);
- Src = In;
- SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);;
- return true;
-}
-
bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
return false;
@@ -2520,17 +2628,6 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
return true;
}
-bool AMDGPUDAGToDAGISel::SelectVOP3PMods0(SDValue In, SDValue &Src,
- SDValue &SrcMods,
- SDValue &Clamp) const {
- SDLoc SL(In);
-
- // FIXME: Handle clamp and op_sel
- Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32);
-
- return SelectVOP3PMods(In, Src, SrcMods);
-}
-
bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
Src = In;
@@ -2539,34 +2636,12 @@ bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
return true;
}
-bool AMDGPUDAGToDAGISel::SelectVOP3OpSel0(SDValue In, SDValue &Src,
- SDValue &SrcMods,
- SDValue &Clamp) const {
- SDLoc SL(In);
-
- // FIXME: Handle clamp
- Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32);
-
- return SelectVOP3OpSel(In, Src, SrcMods);
-}
-
bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
// FIXME: Handle op_sel
return SelectVOP3Mods(In, Src, SrcMods);
}
-bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods0(SDValue In, SDValue &Src,
- SDValue &SrcMods,
- SDValue &Clamp) const {
- SDLoc SL(In);
-
- // FIXME: Handle clamp
- Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32);
-
- return SelectVOP3OpSelMods(In, Src, SrcMods);
-}
-
// The return value is not whether the match is possible (which it always is),
// but whether or not a conversion is really used.
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
@@ -2705,7 +2780,7 @@ bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode * N) const {
(
Subtarget->getScalarizeGlobalBehavior() &&
Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
- !Ld->isVolatile() &&
+ Ld->isSimple() &&
!N->isDivergent() &&
static_cast<const SITargetLowering *>(
getTargetLowering())->isMemOpHasNoClobberedMemOperand(N)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 23cc9404532d5..940ec6f31c698 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -16,7 +16,6 @@
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUFrameLowering.h"
-#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "Utils/AMDGPUBaseInfo.h"
@@ -38,6 +37,11 @@ using namespace llvm;
#include "AMDGPUGenCallingConv.inc"
+static cl::opt<bool> AMDGPUBypassSlowDiv(
+ "amdgpu-bypass-slow-div",
+ cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
+ cl::init(true));
+
// Find a larger type to do a load / store of a vector with.
EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
unsigned StoreSize = VT.getStoreSizeInBits();
@@ -103,6 +107,24 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
+ setOperationAction(ISD::LOAD, MVT::v4i64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
+
+ setOperationAction(ISD::LOAD, MVT::v4f64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
+
+ setOperationAction(ISD::LOAD, MVT::v8i64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);
+
+ setOperationAction(ISD::LOAD, MVT::v8f64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);
+
+ setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);
+
+ setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);
+
// There are no 64-bit extloads. These should be done as a 32-bit extload and
// an extension to 64-bit.
for (MVT VT : MVT::integer_valuetypes()) {
@@ -161,11 +183,13 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
setOperationAction(ISD::STORE, MVT::f32, Promote);
AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
@@ -203,6 +227,24 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STORE, MVT::v2f64, Promote);
AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
+ setOperationAction(ISD::STORE, MVT::v4i64, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
+
+ setOperationAction(ISD::STORE, MVT::v4f64, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);
+
+ setOperationAction(ISD::STORE, MVT::v8i64, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);
+
+ setOperationAction(ISD::STORE, MVT::v8f64, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);
+
+ setOperationAction(ISD::STORE, MVT::v16i64, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);
+
+ setOperationAction(ISD::STORE, MVT::v16f64, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);
+
setTruncStoreAction(MVT::i64, MVT::i1, Expand);
setTruncStoreAction(MVT::i64, MVT::i8, Expand);
setTruncStoreAction(MVT::i64, MVT::i16, Expand);
@@ -227,12 +269,21 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
+ setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
+ setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
+ setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
+ setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
+ setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
+ setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
+ setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
+ setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
+ setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
setOperationAction(ISD::Constant, MVT::i32, Legal);
setOperationAction(ISD::Constant, MVT::i64, Legal);
@@ -297,6 +348,14 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32i32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f64, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i64, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f64, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i64, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f64, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i64, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f64, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i64, Custom);
setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
@@ -329,6 +388,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SUBE, VT, Legal);
}
+ // The hardware supports 32-bit FSHR, but not FSHL.
+ setOperationAction(ISD::FSHR, MVT::i32, Legal);
+
// The hardware supports 32-bit ROTR, but not ROTL.
setOperationAction(ISD::ROTL, MVT::i32, Expand);
setOperationAction(ISD::ROTL, MVT::i64, Expand);
@@ -381,7 +443,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::UREM, VT, Expand);
setOperationAction(ISD::SMUL_LOHI, VT, Expand);
setOperationAction(ISD::UMUL_LOHI, VT, Expand);
- setOperationAction(ISD::SDIVREM, VT, Custom);
+ setOperationAction(ISD::SDIVREM, VT, Expand);
setOperationAction(ISD::UDIVREM, VT, Expand);
setOperationAction(ISD::SELECT, VT, Expand);
setOperationAction(ISD::VSELECT, VT, Expand);
@@ -483,6 +545,10 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
MaxStoresPerMemmove = 0xffffffff;
MaxStoresPerMemset = 0xffffffff;
+ // The expansion for 64-bit division is enormous.
+ if (AMDGPUBypassSlowDiv)
+ addBypassSlowDiv(64, 32);
+
setTargetDAGCombine(ISD::BITCAST);
setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::SRA);
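// Editorial sketch (not part of the patch) of the runtime bypass that
// addBypassSlowDiv(64, 32) requests from CodeGenPrepare; the real transform
// is done on IR, and the function below is only illustrative.
static uint64_t udiv64BypassSketch(uint64_t A, uint64_t B) {
  if (((A | B) >> 32) == 0)                     // both operands fit in 32 bits
    return uint64_t((uint32_t)A / (uint32_t)B); // cheap 32-bit divide
  return A / B;                                 // expanded 64-bit divide path
}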
@@ -609,6 +675,17 @@ bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
return true;
}
+EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
+ ISD::NodeType ExtendKind) const {
+ assert(!VT.isVector() && "only scalar expected");
+
+ // Round to the next multiple of 32-bits.
+ unsigned Size = VT.getSizeInBits();
+ if (Size <= 32)
+ return MVT::i32;
+ return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
+}
+
MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
return MVT::i32;
}
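// Worked example for getTypeForExtReturn above: i1, i8 and i16 return values
// widen to i32; i40 and i48 widen to i64; i96 is already a multiple of 32
// bits and is returned unchanged.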
@@ -641,8 +718,9 @@ bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
unsigned NewSize = NewVT.getStoreSizeInBits();
- // If we are reducing to a 32-bit load, this is always better.
- if (NewSize == 32)
+ // If we are reducing to a 32-bit load or a smaller multi-dword load,
+ // this is always better.
+ if (NewSize >= 32)
return true;
EVT OldVT = N->getValueType(0);
@@ -733,6 +811,26 @@ bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode * N) const {
}
}
+SDValue AMDGPUTargetLowering::getNegatedExpression(
+ SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
+ NegatibleCost &Cost, unsigned Depth) const {
+
+ switch (Op.getOpcode()) {
+ case ISD::FMA:
+ case ISD::FMAD: {
+ // Negating a fma is not free if it has users without source mods.
+ if (!allUsesHaveSourceMods(Op.getNode()))
+ return SDValue();
+ break;
+ }
+ default:
+ break;
+ }
+
+ return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
+ ForCodeSize, Cost, Depth);
+}
+
//===---------------------------------------------------------------------===//
// Target Properties
//===---------------------------------------------------------------------===//
@@ -912,7 +1010,7 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn);
CallingConv::ID CC = Fn.getCallingConv();
- unsigned MaxAlign = 1;
+ Align MaxAlign = Align(1);
uint64_t ExplicitArgOffset = 0;
const DataLayout &DL = Fn.getParent()->getDataLayout();
@@ -920,12 +1018,12 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
for (const Argument &Arg : Fn.args()) {
Type *BaseArgTy = Arg.getType();
- unsigned Align = DL.getABITypeAlignment(BaseArgTy);
- MaxAlign = std::max(Align, MaxAlign);
+ Align Alignment = DL.getABITypeAlign(BaseArgTy);
+ MaxAlign = std::max(Alignment, MaxAlign);
unsigned AllocSize = DL.getTypeAllocSize(BaseArgTy);
- uint64_t ArgOffset = alignTo(ExplicitArgOffset, Align) + ExplicitOffset;
- ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize;
+ uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
+ ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
// We're basically throwing away everything passed into us and starting over
// to get accurate in-memory offsets. The "PartOffset" is completely useless
@@ -999,6 +1097,8 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
assert(MemVT.getVectorNumElements() == 3 ||
MemVT.getVectorNumElements() == 5);
MemVT = MemVT.getPow2VectorType(State.getContext());
+ } else if (!MemVT.isSimple() && !MemVT.isVector()) {
+ MemVT = MemVT.getRoundIntegerType(State.getContext());
}
unsigned PartOffset = 0;
@@ -1140,7 +1240,7 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
case ISD::FROUND: return LowerFROUND(Op, DAG);
case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
case ISD::FLOG:
- return LowerFLOG(Op, DAG, 1.0F / numbers::log2ef);
+ return LowerFLOG(Op, DAG, numbers::ln2f);
case ISD::FLOG10:
return LowerFLOG(Op, DAG, numbers::ln2f / numbers::ln10f);
case ISD::FEXP:
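// Note: 1/log2(e) equals ln(2), so replacing 1.0F / numbers::log2ef with
// numbers::ln2f does not change the scale factor used for FLOG
// (ln x = log2(x) * ln 2); it only spells the constant directly.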
@@ -1196,10 +1296,23 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
if (!MFI->isEntryFunction()) {
+ SDLoc DL(Op);
const Function &Fn = DAG.getMachineFunction().getFunction();
DiagnosticInfoUnsupported BadLDSDecl(
- Fn, "local memory global used by non-kernel function", SDLoc(Op).getDebugLoc());
+ Fn, "local memory global used by non-kernel function",
+ DL.getDebugLoc(), DS_Warning);
DAG.getContext()->diagnose(BadLDSDecl);
+
+ // We currently don't have a way to correctly allocate LDS objects that
+ // aren't directly associated with a kernel. We do force inlining of
+ // functions that use local objects. However, if these dead functions are
+ // not eliminated, we don't want a compile time error. Just emit a warning
+ // and a trap, since there should be no callable path here.
+ SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
+ SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+ Trap, DAG.getRoot());
+ DAG.setRoot(OutputChain);
+ return DAG.getUNDEF(Op.getValueType());
}
// XXX: What does the value of G->getOffset() mean?
@@ -1208,7 +1321,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
// TODO: We could emit code to handle the initialization somewhere.
if (!hasDefinedInitializer(GV)) {
- unsigned Offset = MFI->allocateLDSGlobal(DL, *GV);
+ unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
}
}
@@ -1383,12 +1496,11 @@ AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
(HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
N.getValueType().getVectorNumElements() &&
"More vector elements requested than available!");
- auto IdxTy = getVectorIdxTy(DAG.getDataLayout());
SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
- DAG.getConstant(0, DL, IdxTy));
+ DAG.getVectorIdxConstant(0, DL));
SDValue Hi = DAG.getNode(
HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
- HiVT, N, DAG.getConstant(LoVT.getVectorNumElements(), DL, IdxTy));
+ HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
return std::make_pair(Lo, Hi);
}
@@ -1433,18 +1545,17 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
- auto IdxTy = getVectorIdxTy(DAG.getDataLayout());
SDValue Join;
if (LoVT == HiVT) {
// This is the case that the vector is power of two so was evenly split.
Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
} else {
Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
- DAG.getConstant(0, SL, IdxTy));
- Join = DAG.getNode(HiVT.isVector() ? ISD::INSERT_SUBVECTOR
- : ISD::INSERT_VECTOR_ELT,
- SL, VT, Join, HiLoad,
- DAG.getConstant(LoVT.getVectorNumElements(), SL, IdxTy));
+ DAG.getVectorIdxConstant(0, SL));
+ Join = DAG.getNode(
+ HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, SL,
+ VT, Join, HiLoad,
+ DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), SL));
}
SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
@@ -1474,7 +1585,7 @@ SDValue AMDGPUTargetLowering::WidenVectorLoad(SDValue Op,
WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
return DAG.getMergeValues(
{DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
- DAG.getConstant(0, SL, getVectorIdxTy(DAG.getDataLayout()))),
+ DAG.getVectorIdxConstant(0, SL)),
WideLoad.getValue(1)},
SL);
}
@@ -1588,9 +1699,11 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
// float fr = mad(fqneg, fb, fa);
- unsigned OpCode = MFI->getMode().FP32Denormals ?
- (unsigned)AMDGPUISD::FMAD_FTZ :
- (unsigned)ISD::FMAD;
+ unsigned OpCode = !Subtarget->hasMadMacF32Insts() ?
+ (unsigned)ISD::FMA :
+ !MFI->getMode().allFP32Denormals() ?
+ (unsigned)ISD::FMAD :
+ (unsigned)AMDGPUISD::FMAD_FTZ;
SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
// int iq = (int)fq;
@@ -1673,9 +1786,11 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
// Compute denominator reciprocal.
- unsigned FMAD = MFI->getMode().FP32Denormals ?
- (unsigned)AMDGPUISD::FMAD_FTZ :
- (unsigned)ISD::FMAD;
+ unsigned FMAD = !Subtarget->hasMadMacF32Insts() ?
+ (unsigned)ISD::FMA :
+ !MFI->getMode().allFP32Denormals() ?
+ (unsigned)ISD::FMAD :
+ (unsigned)AMDGPUISD::FMAD_FTZ;
SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
@@ -1861,103 +1976,43 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
return Res;
}
- SDValue Num = Op.getOperand(0);
- SDValue Den = Op.getOperand(1);
-
- // RCP = URECIP(Den) = 2^32 / Den + e
- // e is rounding error.
- SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
-
- // RCP_LO = mul(RCP, Den) */
- SDValue RCP_LO = DAG.getNode(ISD::MUL, DL, VT, RCP, Den);
-
- // RCP_HI = mulhu (RCP, Den) */
- SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den);
-
- // NEG_RCP_LO = -RCP_LO
- SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
- RCP_LO);
-
- // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
- SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
- NEG_RCP_LO, RCP_LO,
- ISD::SETEQ);
- // Calculate the rounding error from the URECIP instruction
- // E = mulhu(ABS_RCP_LO, RCP)
- SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP);
-
- // RCP_A_E = RCP + E
- SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E);
-
- // RCP_S_E = RCP - E
- SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E);
-
- // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E)
- SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
- RCP_A_E, RCP_S_E,
- ISD::SETEQ);
- // Quotient = mulhu(Tmp0, Num)
- SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num);
-
- // Num_S_Remainder = Quotient * Den
- SDValue Num_S_Remainder = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den);
-
- // Remainder = Num - Num_S_Remainder
- SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder);
-
- // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
- SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den,
- DAG.getConstant(-1, DL, VT),
- DAG.getConstant(0, DL, VT),
- ISD::SETUGE);
- // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
- SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num,
- Num_S_Remainder,
- DAG.getConstant(-1, DL, VT),
- DAG.getConstant(0, DL, VT),
- ISD::SETUGE);
- // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
- SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den,
- Remainder_GE_Zero);
-
- // Calculate Division result:
-
- // Quotient_A_One = Quotient + 1
- SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient,
- DAG.getConstant(1, DL, VT));
-
- // Quotient_S_One = Quotient - 1
- SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient,
- DAG.getConstant(1, DL, VT));
-
- // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
- SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
- Quotient, Quotient_A_One, ISD::SETEQ);
-
- // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
- Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
- Quotient_S_One, Div, ISD::SETEQ);
-
- // Calculate Rem result:
-
- // Remainder_S_Den = Remainder - Den
- SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den);
-
- // Remainder_A_Den = Remainder + Den
- SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den);
-
- // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
- SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
- Remainder, Remainder_S_Den, ISD::SETEQ);
-
- // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
- Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
- Remainder_A_Den, Rem, ISD::SETEQ);
- SDValue Ops[2] = {
- Div,
- Rem
- };
- return DAG.getMergeValues(Ops, DL);
+ SDValue X = Op.getOperand(0);
+ SDValue Y = Op.getOperand(1);
+
+ // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
+ // algorithm used here.
+
+ // Initial estimate of inv(y).
+ SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
+
+ // One round of UNR (unsigned Newton-Raphson).
+ SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
+ SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
+ Z = DAG.getNode(ISD::ADD, DL, VT, Z,
+ DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
+
+ // Quotient/remainder estimate.
+ SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
+ SDValue R =
+ DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
+
+ // First quotient/remainder refinement.
+ EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+ SDValue One = DAG.getConstant(1, DL, VT);
+ SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
+ Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
+ DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
+ R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
+ DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
+
+ // Second quotient/remainder refinement.
+ Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
+ Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
+ DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
+ R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
+ DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
+
+ return DAG.getMergeValues({Q, R}, DL);
}
SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
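// Editorial host-side sketch (not part of the patch) of the expansion built
// above. URECIP is modeled as floor((2^32 - 1) / y); the hardware estimate is
// only approximate, which is what the Newton-Raphson round and the two
// conditional corrections compensate for. Assumes Y != 0 and <cstdint>.
static void udivrem32Sketch(uint32_t X, uint32_t Y, uint32_t &Q, uint32_t &R) {
  // Initial estimate of inv(Y) in 0.32 fixed point.
  uint32_t Z = 0xFFFFFFFFu / Y;
  // One round of UNR: Z += mulhu(Z, -Y * Z).
  uint32_t NegYZ = (0u - Y) * Z;
  Z += (uint32_t)(((uint64_t)Z * NegYZ) >> 32);
  // Quotient/remainder estimate.
  Q = (uint32_t)(((uint64_t)X * Z) >> 32);
  R = X - Q * Y;
  // The estimate never overshoots, so at most two corrections are needed.
  if (R >= Y) { ++Q; R -= Y; }
  if (R >= Y) { ++Q; R -= Y; }
}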
@@ -2164,8 +2219,7 @@ SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) con
// Don't handle v2f16. The extra instructions to scalarize and repack around the
// compare and vselect end up producing worse code than scalarizing the whole
// operation.
-SDValue AMDGPUTargetLowering::LowerFROUND_LegalFTRUNC(SDValue Op,
- SelectionDAG &DAG) const {
+SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
SDValue X = Op.getOperand(0);
EVT VT = Op.getValueType();
@@ -2194,75 +2248,6 @@ SDValue AMDGPUTargetLowering::LowerFROUND_LegalFTRUNC(SDValue Op,
return DAG.getNode(ISD::FADD, SL, VT, T, Sel);
}
-SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const {
- SDLoc SL(Op);
- SDValue X = Op.getOperand(0);
-
- SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X);
-
- const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
- const SDValue One = DAG.getConstant(1, SL, MVT::i32);
- const SDValue NegOne = DAG.getConstant(-1, SL, MVT::i32);
- const SDValue FiftyOne = DAG.getConstant(51, SL, MVT::i32);
- EVT SetCCVT =
- getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
-
- SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
-
- SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One);
-
- SDValue Exp = extractF64Exponent(Hi, SL, DAG);
-
- const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), SL,
- MVT::i64);
-
- SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp);
- SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64,
- DAG.getConstant(INT64_C(0x0008000000000000), SL,
- MVT::i64),
- Exp);
-
- SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M);
- SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT,
- DAG.getConstant(0, SL, MVT::i64), Tmp0,
- ISD::SETNE);
-
- SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1,
- D, DAG.getConstant(0, SL, MVT::i64));
- SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2);
-
- K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64));
- K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K);
-
- SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
- SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
- SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ);
-
- SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64,
- ExpEqNegOne,
- DAG.getConstantFP(1.0, SL, MVT::f64),
- DAG.getConstantFP(0.0, SL, MVT::f64));
-
- SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X);
-
- K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K);
- K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K);
-
- return K;
-}
-
-SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
- EVT VT = Op.getValueType();
-
- if (isOperationLegal(ISD::FTRUNC, VT))
- return LowerFROUND_LegalFTRUNC(Op, DAG);
-
- if (VT == MVT::f64)
- return LowerFROUND64(Op, DAG);
-
- llvm_unreachable("unhandled type");
-}
-
SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
SDValue Src = Op.getOperand(0);
@@ -2793,6 +2778,7 @@ static bool isI24(SDValue Op, SelectionDAG &DAG) {
static SDValue simplifyI24(SDNode *Node24,
TargetLowering::DAGCombinerInfo &DCI) {
SelectionDAG &DAG = DCI.DAG;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
@@ -2806,11 +2792,11 @@ static SDValue simplifyI24(SDNode *Node24,
APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
- // First try to simplify using GetDemandedBits which allows the operands to
- // have other uses, but will only perform simplifications that involve
- // bypassing some nodes for this user.
- SDValue DemandedLHS = DAG.GetDemandedBits(LHS, Demanded);
- SDValue DemandedRHS = DAG.GetDemandedBits(RHS, Demanded);
+ // First try to simplify using SimplifyMultipleUseDemandedBits which allows
+ // the operands to have other uses, but will only perform simplifications that
+ // involve bypassing some nodes for this user.
+ SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
+ SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
if (DemandedLHS || DemandedRHS)
return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
DemandedLHS ? DemandedLHS : LHS,
@@ -2818,7 +2804,6 @@ static SDValue simplifyI24(SDNode *Node24,
// Now try SimplifyDemandedBits which can simplify the nodes used by our
// operands if this node is the only user.
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
return SDValue(Node24, 0);
if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
@@ -2877,7 +2862,7 @@ SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
return SDValue();
LoadSDNode *LN = cast<LoadSDNode>(N);
- if (LN->isVolatile() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
+ if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
return SDValue();
SDLoc SL(N);
@@ -2885,16 +2870,17 @@ SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
EVT VT = LN->getMemoryVT();
unsigned Size = VT.getStoreSize();
- unsigned Align = LN->getAlignment();
- if (Align < Size && isTypeLegal(VT)) {
+ Align Alignment = LN->getAlign();
+ if (Alignment < Size && isTypeLegal(VT)) {
bool IsFast;
unsigned AS = LN->getAddressSpace();
// Expand unaligned loads earlier than legalization. Due to visitation order
// problems during legalization, the emitted instructions to pack and unpack
// the bytes again are not eliminated in the case of an unaligned copy.
- if (!allowsMisalignedMemoryAccesses(
- VT, AS, Align, LN->getMemOperand()->getFlags(), &IsFast)) {
+ if (!allowsMisalignedMemoryAccesses(VT, AS, Alignment.value(),
+ LN->getMemOperand()->getFlags(),
+ &IsFast)) {
SDValue Ops[2];
if (VT.isVector())
@@ -2931,7 +2917,7 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
return SDValue();
StoreSDNode *SN = cast<StoreSDNode>(N);
- if (SN->isVolatile() || !ISD::isNormalStore(SN))
+ if (!SN->isSimple() || !ISD::isNormalStore(SN))
return SDValue();
EVT VT = SN->getMemoryVT();
@@ -2939,8 +2925,8 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
SDLoc SL(N);
SelectionDAG &DAG = DCI.DAG;
- unsigned Align = SN->getAlignment();
- if (Align < Size && isTypeLegal(VT)) {
+ Align Alignment = SN->getAlign();
+ if (Alignment < Size && isTypeLegal(VT)) {
bool IsFast;
unsigned AS = SN->getAddressSpace();
@@ -2948,8 +2934,9 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
// order problems during legalization, the emitted instructions to pack and
// unpack the bytes again are not eliminated in the case of an unaligned
// copy.
- if (!allowsMisalignedMemoryAccesses(
- VT, AS, Align, SN->getMemOperand()->getFlags(), &IsFast)) {
+ if (!allowsMisalignedMemoryAccesses(VT, AS, Alignment.value(),
+ SN->getMemOperand()->getFlags(),
+ &IsFast)) {
if (VT.isVector())
return scalarizeVectorStore(SN, DAG);
@@ -3012,6 +2999,16 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
case Intrinsic::amdgcn_mul_i24:
case Intrinsic::amdgcn_mul_u24:
return simplifyI24(N, DCI);
+ case Intrinsic::amdgcn_fract:
+ case Intrinsic::amdgcn_rsq:
+ case Intrinsic::amdgcn_rcp_legacy:
+ case Intrinsic::amdgcn_rsq_legacy:
+ case Intrinsic::amdgcn_rsq_clamp:
+ case Intrinsic::amdgcn_ldexp: {
+ // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
+ SDValue Src = N->getOperand(1);
+ return Src.isUndef() ? Src : SDValue();
+ }
default:
return SDValue();
}
@@ -3465,24 +3462,24 @@ SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue C
ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
SDValue CmpLHS = Cond.getOperand(0);
- unsigned Opc = isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 :
- AMDGPUISD::FFBH_U32;
-
// select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
// select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
if (CCOpcode == ISD::SETEQ &&
(isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
- RHS.getOperand(0) == CmpLHS &&
- isNegativeOne(LHS)) {
+ RHS.getOperand(0) == CmpLHS && isNegativeOne(LHS)) {
+ unsigned Opc =
+ isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
return getFFBX_U32(DAG, CmpLHS, SL, Opc);
}
// select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
// select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
if (CCOpcode == ISD::SETNE &&
- (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
- LHS.getOperand(0) == CmpLHS &&
- isNegativeOne(RHS)) {
+ (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
+ LHS.getOperand(0) == CmpLHS && isNegativeOne(RHS)) {
+ unsigned Opc =
+ isCttzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
+
return getFFBX_U32(DAG, CmpLHS, SL, Opc);
}
@@ -4117,12 +4114,12 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
const TargetRegisterClass *RC,
- unsigned Reg, EVT VT,
+ Register Reg, EVT VT,
const SDLoc &SL,
bool RawReg) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineRegisterInfo &MRI = MF.getRegInfo();
- unsigned VReg;
+ Register VReg;
if (!MRI.isLiveIn(Reg)) {
VReg = MRI.createVirtualRegister(RC);
@@ -4266,11 +4263,9 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(DIV_FMAS)
NODE_NAME_CASE(DIV_FIXUP)
NODE_NAME_CASE(FMAD_FTZ)
- NODE_NAME_CASE(TRIG_PREOP)
NODE_NAME_CASE(RCP)
NODE_NAME_CASE(RSQ)
NODE_NAME_CASE(RCP_LEGACY)
- NODE_NAME_CASE(RSQ_LEGACY)
NODE_NAME_CASE(RCP_IFLAG)
NODE_NAME_CASE(FMUL_LEGACY)
NODE_NAME_CASE(RSQ_CLAMP)
@@ -4298,8 +4293,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(MAD_U64_U32)
NODE_NAME_CASE(PERM)
NODE_NAME_CASE(TEXTURE_FETCH)
- NODE_NAME_CASE(EXPORT)
- NODE_NAME_CASE(EXPORT_DONE)
NODE_NAME_CASE(R600_EXPORT)
NODE_NAME_CASE(CONST_ADDRESS)
NODE_NAME_CASE(REGISTER_LOAD)
@@ -4323,12 +4316,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(CONST_DATA_PTR)
NODE_NAME_CASE(PC_ADD_REL_OFFSET)
NODE_NAME_CASE(LDS)
- NODE_NAME_CASE(KILL)
NODE_NAME_CASE(DUMMY_CHAIN)
case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
- NODE_NAME_CASE(INTERP_P1LL_F16)
- NODE_NAME_CASE(INTERP_P1LV_F16)
- NODE_NAME_CASE(INTERP_P2_F16)
NODE_NAME_CASE(LOAD_D16_HI)
NODE_NAME_CASE(LOAD_D16_LO)
NODE_NAME_CASE(LOAD_D16_HI_I8)
@@ -4347,6 +4336,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(ATOMIC_DEC)
NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
+ NODE_NAME_CASE(ATOMIC_LOAD_CSUB)
NODE_NAME_CASE(BUFFER_LOAD)
NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
NODE_NAME_CASE(BUFFER_LOAD_USHORT)
@@ -4373,6 +4363,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BUFFER_ATOMIC_INC)
NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
+ NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
NODE_NAME_CASE(BUFFER_ATOMIC_PK_FADD)
NODE_NAME_CASE(ATOMIC_PK_FADD)
@@ -4539,11 +4530,10 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
}
case AMDGPUISD::LDS: {
auto GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
- unsigned Align = GA->getGlobal()->getAlignment();
+ Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());
Known.Zero.setHighBits(16);
- if (Align)
- Known.Zero.setLowBits(Log2_32(Align));
+ Known.Zero.setLowBits(Log2(Alignment));
break;
}
case ISD::INTRINSIC_WO_CHAIN: {
@@ -4607,6 +4597,29 @@ unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
}
}
+unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
+ GISelKnownBits &Analysis, Register R,
+ const APInt &DemandedElts, const MachineRegisterInfo &MRI,
+ unsigned Depth) const {
+ const MachineInstr *MI = MRI.getVRegDef(R);
+ if (!MI)
+ return 1;
+
+ // TODO: Check range metadata on MMO.
+ switch (MI->getOpcode()) {
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
+ return 25;
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
+ return 17;
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
+ return 24;
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
+ return 16;
+ default:
+ return 1;
+ }
+}
+
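
As a quick cross-check of the constants above, a small illustrative C++ sketch (not from the patch): a sign-extending byte load leaves at least 32 - 8 + 1 = 25 copies of the sign bit in a 32-bit register, a zero-extending one at least 24, and likewise 17/16 for the 16-bit variants.

#include <cassert>
#include <cstdint>

// Counts how many high bits of V are copies of the sign bit, including the
// sign bit itself (the quantity ComputeNumSignBits reports).
static unsigned numSignBits(uint32_t V) {
  unsigned N = 1;
  for (int Bit = 30; Bit >= 0 && ((V >> Bit) & 1) == (V >> 31); --Bit)
    ++N;
  return N;
}

int main() {
  // Worst cases for an 8-bit payload.
  uint32_t SExt = static_cast<uint32_t>(static_cast<int32_t>(int8_t(0x55)));
  uint32_t ZExt = uint8_t(0xAA);
  assert(numSignBits(SExt) >= 25); // sbyte: 32 - 8 + 1
  assert(numSignBits(ZExt) >= 24); // ubyte: 32 - 8
  return 0;
}
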
bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
const SelectionDAG &DAG,
bool SNaN,
@@ -4648,7 +4661,6 @@ bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
case AMDGPUISD::RCP:
case AMDGPUISD::RSQ:
case AMDGPUISD::RCP_LEGACY:
- case AMDGPUISD::RSQ_LEGACY:
case AMDGPUISD::RSQ_CLAMP: {
if (SNaN)
return true;
@@ -4665,7 +4677,6 @@ bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
case AMDGPUISD::DIV_SCALE:
case AMDGPUISD::DIV_FMAS:
case AMDGPUISD::DIV_FIXUP:
- case AMDGPUISD::TRIG_PREOP:
// TODO: Refine on operands.
return SNaN;
case AMDGPUISD::SIN_HW:
@@ -4692,6 +4703,18 @@ bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
}
+ case Intrinsic::amdgcn_rcp:
+ case Intrinsic::amdgcn_rsq:
+ case Intrinsic::amdgcn_rcp_legacy:
+ case Intrinsic::amdgcn_rsq_legacy:
+ case Intrinsic::amdgcn_rsq_clamp: {
+ if (SNaN)
+ return true;
+
+ // TODO: Need is known positive check.
+ return false;
+ }
+ case Intrinsic::amdgcn_trig_preop:
case Intrinsic::amdgcn_fdot2:
// TODO: Refine on operand
return SNaN;
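
A tiny C++ illustration (not the compiler query itself) of the distinction drawn here: these intrinsics quiet signaling NaNs, so the SNaN-only query can succeed, but a quiet NaN result is still possible for invalid inputs, which is why the general case stays conservative until a known-positive check exists.

#include <cassert>
#include <cmath>

int main() {
  // An rsq-style computation on a negative input yields a quiet NaN, so
  // "never NaN" cannot be proven without knowing the operand is positive.
  double Rsq = 1.0 / std::sqrt(-4.0);
  assert(std::isnan(Rsq));
  return 0;
}
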
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index a90b7f5653dcc..85f23c81db170 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -18,6 +18,7 @@
#include "AMDGPU.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
namespace llvm {
@@ -52,8 +53,6 @@ protected:
SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerFROUND_LegalFTRUNC(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerFROUND64(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFLOG(SDValue Op, SelectionDAG &DAG,
@@ -172,8 +171,16 @@ public:
bool isZExtFree(EVT Src, EVT Dest) const override;
bool isZExtFree(SDValue Val, EVT VT2) const override;
+ SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG,
+ bool LegalOperations, bool ForCodeSize,
+ NegatibleCost &Cost,
+ unsigned Depth) const override;
+
bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
+ EVT getTypeForExtReturn(LLVMContext &Context, EVT VT,
+ ISD::NodeType ExtendKind) const override;
+
MVT getVectorIdxTy(const DataLayout &) const override;
bool isSelectSupported(SelectSupportKind) const override;
@@ -264,6 +271,12 @@ public:
const SelectionDAG &DAG,
unsigned Depth = 0) const override;
+ unsigned computeNumSignBitsForTargetInstr(GISelKnownBits &Analysis,
+ Register R,
+ const APInt &DemandedElts,
+ const MachineRegisterInfo &MRI,
+ unsigned Depth = 0) const override;
+
bool isKnownNeverNaNForTargetNode(SDValue Op,
const SelectionDAG &DAG,
bool SNaN = false,
@@ -276,19 +289,19 @@ public:
/// a copy from the register.
SDValue CreateLiveInRegister(SelectionDAG &DAG,
const TargetRegisterClass *RC,
- unsigned Reg, EVT VT,
+ Register Reg, EVT VT,
const SDLoc &SL,
bool RawReg = false) const;
SDValue CreateLiveInRegister(SelectionDAG &DAG,
const TargetRegisterClass *RC,
- unsigned Reg, EVT VT) const {
+ Register Reg, EVT VT) const {
return CreateLiveInRegister(DAG, RC, Reg, VT, SDLoc(DAG.getEntryNode()));
}
// Returns the raw live in register rather than a copy from it.
SDValue CreateLiveInRegisterRaw(SelectionDAG &DAG,
const TargetRegisterClass *RC,
- unsigned Reg, EVT VT) const {
+ Register Reg, EVT VT) const {
return CreateLiveInRegister(DAG, RC, Reg, VT, SDLoc(DAG.getEntryNode()), true);
}
@@ -398,14 +411,12 @@ enum NodeType : unsigned {
// For emitting ISD::FMAD when f32 denormals are enabled because mac/mad is
// treated as an illegal operation.
FMAD_FTZ,
- TRIG_PREOP, // 1 ULP max error for f64
// RCP, RSQ - For f32, 1 ULP max error, no denormal handling.
// For f64, max error 2^29 ULP, handles denormals.
RCP,
RSQ,
RCP_LEGACY,
- RSQ_LEGACY,
RCP_IFLAG,
FMUL_LEGACY,
RSQ_CLAMP,
@@ -433,8 +444,6 @@ enum NodeType : unsigned {
MUL_LOHI_U24,
PERM,
TEXTURE_FETCH,
- EXPORT, // exp on SI+
- EXPORT_DONE, // exp on SI+ with done bit set
R600_EXPORT,
CONST_ADDRESS,
REGISTER_LOAD,
@@ -476,12 +485,8 @@ enum NodeType : unsigned {
BUILD_VERTICAL_VECTOR,
/// Pointer to the start of the shader's constant data.
CONST_DATA_PTR,
- INTERP_P1LL_F16,
- INTERP_P1LV_F16,
- INTERP_P2_F16,
PC_ADD_REL_OFFSET,
LDS,
- KILL,
DUMMY_CHAIN,
FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
LOAD_D16_HI,
@@ -503,6 +508,7 @@ enum NodeType : unsigned {
ATOMIC_DEC,
ATOMIC_LOAD_FMIN,
ATOMIC_LOAD_FMAX,
+ ATOMIC_LOAD_CSUB,
BUFFER_LOAD,
BUFFER_LOAD_UBYTE,
BUFFER_LOAD_USHORT,
@@ -529,6 +535,7 @@ enum NodeType : unsigned {
BUFFER_ATOMIC_INC,
BUFFER_ATOMIC_DEC,
BUFFER_ATOMIC_CMPSWAP,
+ BUFFER_ATOMIC_CSUB,
BUFFER_ATOMIC_FADD,
BUFFER_ATOMIC_PK_FADD,
ATOMIC_PK_FADD,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInline.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInline.cpp
index 64d761997b0cc..3b5d91133a2f8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInline.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInline.cpp
@@ -23,7 +23,6 @@
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
@@ -67,9 +66,9 @@ public:
static char ID; // Pass identification, replacement for typeid
- unsigned getInlineThreshold(CallSite CS) const;
+ unsigned getInlineThreshold(CallBase &CB) const;
- InlineCost getInlineCost(CallSite CS) override;
+ InlineCost getInlineCost(CallBase &CB) override;
bool runOnSCC(CallGraphSCC &SCC) override;
@@ -106,13 +105,13 @@ void AMDGPUInliner::getAnalysisUsage(AnalysisUsage &AU) const {
LegacyInlinerBase::getAnalysisUsage(AU);
}
-unsigned AMDGPUInliner::getInlineThreshold(CallSite CS) const {
+unsigned AMDGPUInliner::getInlineThreshold(CallBase &CB) const {
int Thres = Params.DefaultThreshold;
- Function *Caller = CS.getCaller();
+ Function *Caller = CB.getCaller();
// Listen to the inlinehint attribute when it would increase the threshold
// and the caller does not need to minimize its size.
- Function *Callee = CS.getCalledFunction();
+ Function *Callee = CB.getCalledFunction();
bool InlineHint = Callee && !Callee->isDeclaration() &&
Callee->hasFnAttribute(Attribute::InlineHint);
if (InlineHint && Params.HintThreshold && Params.HintThreshold > Thres
@@ -129,7 +128,7 @@ unsigned AMDGPUInliner::getInlineThreshold(CallSite CS) const {
  // Increase the inline threshold to allow inlining in this case.
uint64_t AllocaSize = 0;
SmallPtrSet<const AllocaInst *, 8> AIVisited;
- for (Value *PtrArg : CS.args()) {
+ for (Value *PtrArg : CB.args()) {
PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
if (!Ty || (Ty->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS &&
Ty->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS))
@@ -156,8 +155,8 @@ unsigned AMDGPUInliner::getInlineThreshold(CallSite CS) const {
// Check if call is just a wrapper around another call.
// In this case we only have call and ret instructions.
-static bool isWrapperOnlyCall(CallSite CS) {
- Function *Callee = CS.getCalledFunction();
+static bool isWrapperOnlyCall(CallBase &CB) {
+ Function *Callee = CB.getCalledFunction();
if (!Callee || Callee->size() != 1)
return false;
const BasicBlock &BB = Callee->getEntryBlock();
@@ -174,32 +173,32 @@ static bool isWrapperOnlyCall(CallSite CS) {
return false;
}
-InlineCost AMDGPUInliner::getInlineCost(CallSite CS) {
- Function *Callee = CS.getCalledFunction();
- Function *Caller = CS.getCaller();
+InlineCost AMDGPUInliner::getInlineCost(CallBase &CB) {
+ Function *Callee = CB.getCalledFunction();
+ Function *Caller = CB.getCaller();
if (!Callee || Callee->isDeclaration())
return llvm::InlineCost::getNever("undefined callee");
- if (CS.isNoInline())
+ if (CB.isNoInline())
return llvm::InlineCost::getNever("noinline");
TargetTransformInfo &TTI = TTIWP->getTTI(*Callee);
if (!TTI.areInlineCompatible(Caller, Callee))
return llvm::InlineCost::getNever("incompatible");
- if (CS.hasFnAttr(Attribute::AlwaysInline)) {
+ if (CB.hasFnAttr(Attribute::AlwaysInline)) {
auto IsViable = isInlineViable(*Callee);
- if (IsViable)
+ if (IsViable.isSuccess())
return llvm::InlineCost::getAlways("alwaysinline viable");
- return llvm::InlineCost::getNever(IsViable.message);
+ return llvm::InlineCost::getNever(IsViable.getFailureReason());
}
- if (isWrapperOnlyCall(CS))
+ if (isWrapperOnlyCall(CB))
return llvm::InlineCost::getAlways("wrapper-only call");
InlineParams LocalParams = Params;
- LocalParams.DefaultThreshold = (int)getInlineThreshold(CS);
+ LocalParams.DefaultThreshold = (int)getInlineThreshold(CB);
bool RemarksEnabled = false;
const auto &BBs = Caller->getBasicBlockList();
if (!BBs.empty()) {
@@ -209,14 +208,13 @@ InlineCost AMDGPUInliner::getInlineCost(CallSite CS) {
}
OptimizationRemarkEmitter ORE(Caller);
- std::function<AssumptionCache &(Function &)> GetAssumptionCache =
- [this](Function &F) -> AssumptionCache & {
+ auto GetAssumptionCache = [this](Function &F) -> AssumptionCache & {
return ACT->getAssumptionCache(F);
};
- auto IC = llvm::getInlineCost(cast<CallBase>(*CS.getInstruction()), Callee,
- LocalParams, TTI, GetAssumptionCache, None, PSI,
- RemarksEnabled ? &ORE : nullptr);
+ auto IC = llvm::getInlineCost(CB, Callee, LocalParams, TTI,
+ GetAssumptionCache, GetTLI, nullptr, PSI,
+ RemarksEnabled ? &ORE : nullptr);
if (IC && !IC.isAlways() && !Callee->hasFnAttribute(Attribute::InlineHint)) {
// Single BB does not increase total BB amount, thus subtract 1
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
index 9951cbf2326e3..6c13bc8599dbb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
@@ -13,7 +13,6 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUInstrInfo.h"
-#include "AMDGPURegisterInfo.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
index 698189e14c21e..61b78acad3f4b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
@@ -55,6 +55,9 @@ struct ImageDimIntrinsicInfo {
};
const ImageDimIntrinsicInfo *getImageDimIntrinsicInfo(unsigned Intr);
+const ImageDimIntrinsicInfo *getImageDimInstrinsicByBaseOpcode(unsigned BaseOpcode,
+ unsigned Dim);
+
} // end AMDGPU namespace
} // End llvm namespace
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index 50c451be4b867..894677ec68b60 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
//
-// This file contains DAG node defintions for the AMDGPU target.
+// This file contains DAG node definitions for the AMDGPU target.
//
//===----------------------------------------------------------------------===//
@@ -18,10 +18,6 @@ def AMDGPUDTIntTernaryOp : SDTypeProfile<1, 3, [
SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisInt<3>
]>;
-def AMDGPUTrigPreOp : SDTypeProfile<1, 2,
- [SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>]
->;
-
def AMDGPULdExpOp : SDTypeProfile<1, 2,
[SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>]
>;
@@ -121,8 +117,6 @@ def AMDGPUrcp_impl : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>;
// out = 1.0 / sqrt(a)
def AMDGPUrsq_impl : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>;
-// out = 1.0 / sqrt(a)
-def AMDGPUrsq_legacy_impl : SDNode<"AMDGPUISD::RSQ_LEGACY", SDTFPUnaryOp>;
def AMDGPUrcp_legacy_impl : SDNode<"AMDGPUISD::RCP_LEGACY", SDTFPUnaryOp>;
def AMDGPUrcp_iflag : SDNode<"AMDGPUISD::RCP_IFLAG", SDTFPUnaryOp>;
@@ -151,7 +145,7 @@ def AMDGPUfmax_legacy : SDNode<"AMDGPUISD::FMAX_LEGACY", SDTFPBinOp,
[]
>;
-def AMDGPUfmul_legacy : SDNode<"AMDGPUISD::FMUL_LEGACY", SDTFPBinOp,
+def AMDGPUfmul_legacy_impl : SDNode<"AMDGPUISD::FMUL_LEGACY", SDTFPBinOp,
[SDNPCommutative, SDNPAssociative]
>;
@@ -204,13 +198,6 @@ def AMDGPUSetCCOp : SDTypeProfile<1, 3, [ // setcc
def AMDGPUsetcc : SDNode<"AMDGPUISD::SETCC", AMDGPUSetCCOp>;
-def AMDGPUSetRegOp : SDTypeProfile<0, 2, [
- SDTCisInt<0>, SDTCisInt<1>
-]>;
-
-def AMDGPUsetreg : SDNode<"AMDGPUISD::SETREG", AMDGPUSetRegOp, [
- SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>;
-
def AMDGPUfma : SDNode<"AMDGPUISD::FMA_W_CHAIN", SDTFPTernaryOp, [
SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
@@ -238,7 +225,7 @@ def AMDGPUdiv_scale : SDNode<"AMDGPUISD::DIV_SCALE", AMDGPUDivScaleOp>;
// Special case divide FMA with scale and flags (src0 = Quotient,
// src1 = Denominator, src2 = Numerator).
-def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", AMDGPUFmasOp,
+def AMDGPUdiv_fmas_impl : SDNode<"AMDGPUISD::DIV_FMAS", AMDGPUFmasOp,
[SDNPOptInGlue]>;
// Single or double precision division fixup.
@@ -248,9 +235,6 @@ def AMDGPUdiv_fixup_impl : SDNode<"AMDGPUISD::DIV_FIXUP", SDTFPTernaryOp>;
def AMDGPUfmad_ftz_impl : SDNode<"AMDGPUISD::FMAD_FTZ", SDTFPTernaryOp>;
-// Look Up 2.0 / pi src0 with segment select src1[4:0]
-def AMDGPUtrig_preop : SDNode<"AMDGPUISD::TRIG_PREOP", AMDGPUTrigPreOp>;
-
def AMDGPUregister_load : SDNode<"AMDGPUISD::REGISTER_LOAD",
SDTypeProfile<1, 2, [SDTCisPtrTy<1>, SDTCisInt<2>]>,
[SDNPHasChain, SDNPMayLoad]>;
@@ -278,18 +262,18 @@ def AMDGPUatomic_cmp_swap : SDNode<"AMDGPUISD::ATOMIC_CMP_SWAP",
def AMDGPUround : SDNode<"ISD::FROUND",
SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>>;
-def AMDGPUbfe_u32 : SDNode<"AMDGPUISD::BFE_U32", AMDGPUDTIntTernaryOp>;
-def AMDGPUbfe_i32 : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>;
+def AMDGPUbfe_u32_impl : SDNode<"AMDGPUISD::BFE_U32", AMDGPUDTIntTernaryOp>;
+def AMDGPUbfe_i32_impl : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>;
def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>;
def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>;
-def AMDGPUffbh_u32 : SDNode<"AMDGPUISD::FFBH_U32", SDTIntUnaryOp>;
-def AMDGPUffbh_i32_impl : SDNode<"AMDGPUISD::FFBH_I32", SDTIntUnaryOp>;
+def AMDGPUffbh_u32_impl : SDNode<"AMDGPUISD::FFBH_U32", SDTIntBitCountUnaryOp>;
+def AMDGPUffbh_i32_impl : SDNode<"AMDGPUISD::FFBH_I32", SDTIntBitCountUnaryOp>;
-def AMDGPUffbl_b32 : SDNode<"AMDGPUISD::FFBL_B32", SDTIntUnaryOp>;
+def AMDGPUffbl_b32_impl : SDNode<"AMDGPUISD::FFBL_B32", SDTIntBitCountUnaryOp>;
// Signed and unsigned 24-bit multiply. The highest 8-bits are ignored
-// when performing the mulitply. The result is a 32-bit value.
+// when performing the multiply. The result is a 32-bit value.
def AMDGPUmul_u24_impl : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp,
[SDNPCommutative, SDNPAssociative]
>;
@@ -321,7 +305,7 @@ def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp,
def AMDGPUfmed3_impl : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>;
-def AMDGPUfdot2 : SDNode<"AMDGPUISD::FDOT2",
+def AMDGPUfdot2_impl : SDNode<"AMDGPUISD::FDOT2",
SDTypeProfile<1, 4, [SDTCisSameAs<0, 3>, SDTCisSameAs<1, 2>,
SDTCisFP<0>, SDTCisVec<1>,
SDTCisInt<4>]>,
@@ -329,21 +313,6 @@ def AMDGPUfdot2 : SDNode<"AMDGPUISD::FDOT2",
def AMDGPUperm : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>;
-def AMDGPUinterp_p1ll_f16 : SDNode<"AMDGPUISD::INTERP_P1LL_F16",
- SDTypeProfile<1, 7, [SDTCisFP<0>]>,
- [SDNPInGlue, SDNPOutGlue]>;
-
-def AMDGPUinterp_p1lv_f16 : SDNode<"AMDGPUISD::INTERP_P1LV_F16",
- SDTypeProfile<1, 9, [SDTCisFP<0>]>,
- [SDNPInGlue, SDNPOutGlue]>;
-
-def AMDGPUinterp_p2_f16 : SDNode<"AMDGPUISD::INTERP_P2_F16",
- SDTypeProfile<1, 8, [SDTCisFP<0>]>,
- [SDNPInGlue]>;
-
-def AMDGPUkill : SDNode<"AMDGPUISD::KILL", AMDGPUKillSDT,
- [SDNPHasChain, SDNPSideEffect]>;
-
// SI+ export
def AMDGPUExportOp : SDTypeProfile<0, 8, [
SDTCisInt<0>, // i8 tgt
@@ -358,12 +327,6 @@ def AMDGPUExportOp : SDTypeProfile<0, 8, [
]>;
-def AMDGPUexport: SDNode<"AMDGPUISD::EXPORT", AMDGPUExportOp,
- [SDNPHasChain, SDNPMayStore]>;
-
-def AMDGPUexport_done: SDNode<"AMDGPUISD::EXPORT_DONE", AMDGPUExportOp,
- [SDNPHasChain, SDNPMayLoad, SDNPMayStore]>;
-
def R600ExportOp : SDTypeProfile<0, 7, [SDTCisFP<0>, SDTCisInt<1>]>;
@@ -398,7 +361,7 @@ def AMDGPUret_flag : SDNode<"AMDGPUISD::RET_FLAG", SDTypeProfile<0, 1, [SDTCisPt
//===----------------------------------------------------------------------===//
-// Intrinsic/Custom node compatability PatFrags
+// Intrinsic/Custom node compatibility PatFrags
//===----------------------------------------------------------------------===//
def AMDGPUrcp : PatFrags<(ops node:$src), [(int_amdgcn_rcp node:$src),
@@ -406,9 +369,6 @@ def AMDGPUrcp : PatFrags<(ops node:$src), [(int_amdgcn_rcp node:$src),
def AMDGPUrcp_legacy : PatFrags<(ops node:$src), [(int_amdgcn_rcp_legacy node:$src),
(AMDGPUrcp_legacy_impl node:$src)]>;
-def AMDGPUrsq_legacy : PatFrags<(ops node:$src), [(int_amdgcn_rsq_legacy node:$src),
- (AMDGPUrsq_legacy_impl node:$src)]>;
-
def AMDGPUrsq : PatFrags<(ops node:$src), [(int_amdgcn_rsq node:$src),
(AMDGPUrsq_impl node:$src)]>;
@@ -442,6 +402,14 @@ def AMDGPUffbh_i32 : PatFrags<(ops node:$src),
[(int_amdgcn_sffbh node:$src),
(AMDGPUffbh_i32_impl node:$src)]>;
+def AMDGPUffbh_u32 : PatFrags<(ops node:$src),
+ [(ctlz_zero_undef node:$src),
+ (AMDGPUffbh_u32_impl node:$src)]>;
+
+def AMDGPUffbl_b32 : PatFrags<(ops node:$src),
+ [(cttz_zero_undef node:$src),
+ (AMDGPUffbl_b32_impl node:$src)]>;
+
def AMDGPUpkrtz_f16_f32 : PatFrags<(ops node:$src0, node:$src1),
[(int_amdgcn_cvt_pkrtz node:$src0, node:$src1),
(AMDGPUpkrtz_f16_f32_impl node:$src0, node:$src1)]>;
@@ -473,3 +441,23 @@ def AMDGPUmul_u24 : PatFrags<(ops node:$src0, node:$src1),
def AMDGPUmul_i24 : PatFrags<(ops node:$src0, node:$src1),
[(int_amdgcn_mul_i24 node:$src0, node:$src1),
(AMDGPUmul_i24_impl node:$src0, node:$src1)]>;
+
+def AMDGPUbfe_i32 : PatFrags<(ops node:$src0, node:$src1, node:$src2),
+ [(int_amdgcn_sbfe node:$src0, node:$src1, node:$src2),
+ (AMDGPUbfe_i32_impl node:$src0, node:$src1, node:$src2)]>;
+
+def AMDGPUbfe_u32 : PatFrags<(ops node:$src0, node:$src1, node:$src2),
+ [(int_amdgcn_ubfe node:$src0, node:$src1, node:$src2),
+ (AMDGPUbfe_u32_impl node:$src0, node:$src1, node:$src2)]>;
+
+def AMDGPUfmul_legacy : PatFrags<(ops node:$src0, node:$src1),
+ [(int_amdgcn_fmul_legacy node:$src0, node:$src1),
+ (AMDGPUfmul_legacy_impl node:$src0, node:$src1)]>;
+
+def AMDGPUfdot2 : PatFrags<(ops node:$src0, node:$src1, node:$src2, node:$clamp),
+ [(int_amdgcn_fdot2 node:$src0, node:$src1, node:$src2, node:$clamp),
+ (AMDGPUfdot2_impl node:$src0, node:$src1, node:$src2, node:$clamp)]>;
+
+def AMDGPUdiv_fmas : PatFrags<(ops node:$src0, node:$src1, node:$src2, node:$vcc),
+ [(int_amdgcn_div_fmas node:$src0, node:$src1, node:$src2, node:$vcc),
+ (AMDGPUdiv_fmas_impl node:$src0, node:$src1, node:$src2, node:$vcc)]>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index c0ea35817ec8e..2025c0fa5d21b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -15,7 +15,6 @@
#include "AMDGPUInstrInfo.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPURegisterBankInfo.h"
-#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
@@ -40,6 +39,12 @@
using namespace llvm;
using namespace MIPatternMatch;
+static cl::opt<bool> AllowRiskySelect(
+ "amdgpu-global-isel-risky-select",
+ cl::desc("Allow GlobalISel to select cases that are likely to not work yet"),
+ cl::init(false),
+ cl::ReallyHidden);
+
#define GET_GLOBALISEL_IMPL
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenGlobalISel.inc"
@@ -88,6 +93,30 @@ bool AMDGPUInstructionSelector::isVCC(Register Reg,
return RB->getID() == AMDGPU::VCCRegBankID;
}
+bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
+ unsigned NewOpc) const {
+ MI.setDesc(TII.get(NewOpc));
+ MI.RemoveOperand(1); // Remove intrinsic ID.
+ MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
+
+ MachineOperand &Dst = MI.getOperand(0);
+ MachineOperand &Src = MI.getOperand(1);
+
+ // TODO: This should be legalized to s32 if needed
+ if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
+ return false;
+
+ const TargetRegisterClass *DstRC
+ = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
+ const TargetRegisterClass *SrcRC
+ = TRI.getConstrainedRegClassForOperand(Src, *MRI);
+ if (!DstRC || DstRC != SrcRC)
+ return false;
+
+ return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
+ RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
+}
+
bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
const DebugLoc &DL = I.getDebugLoc();
MachineBasicBlock *BB = I.getParent();
@@ -173,6 +202,14 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
const Register DefReg = I.getOperand(0).getReg();
const LLT DefTy = MRI->getType(DefReg);
+ if (DefTy == LLT::scalar(1)) {
+ if (!AllowRiskySelect) {
+ LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n");
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "Selecting risky boolean phi\n");
+ }
// TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
@@ -261,6 +298,11 @@ bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(),
RC == &AMDGPU::SReg_64RegClass);
I.setDesc(TII.get(InstOpc));
+ // Dead implicit-def of scc
+ I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
+ true, // isImp
+ false, // isKill
+ true)); // isDead
// FIXME: Hack to avoid turning the register bank into a register class.
// The selector for G_ICMP relies on seeing the register bank for the result
@@ -295,7 +337,11 @@ bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
MachineFunction *MF = BB->getParent();
Register DstReg = I.getOperand(0).getReg();
const DebugLoc &DL = I.getDebugLoc();
- unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
+ LLT Ty = MRI->getType(DstReg);
+ if (Ty.isVector())
+ return false;
+
+ unsigned Size = Ty.getSizeInBits();
const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
@@ -445,6 +491,7 @@ bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
return true;
}
+// TODO: We should probably legalize these to only using 32-bit results.
bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
Register DstReg = I.getOperand(0).getReg();
@@ -452,11 +499,21 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
LLT DstTy = MRI->getType(DstReg);
LLT SrcTy = MRI->getType(SrcReg);
const unsigned SrcSize = SrcTy.getSizeInBits();
- const unsigned DstSize = DstTy.getSizeInBits();
+ unsigned DstSize = DstTy.getSizeInBits();
// TODO: Should handle any multiple of 32 offset.
unsigned Offset = I.getOperand(2).getImm();
- if (Offset % DstSize != 0)
+ if (Offset % 32 != 0 || DstSize > 128)
+ return false;
+
+ // 16-bit operations really use 32-bit registers.
+ // FIXME: Probably should not allow 16-bit G_EXTRACT results.
+ if (DstSize == 16)
+ DstSize = 32;
+
+ const TargetRegisterClass *DstRC =
+ TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
+ if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
return false;
const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
@@ -464,20 +521,18 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
if (!SrcRC)
return false;
+ unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
+ DstSize / 32);
+ SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
+ if (!SrcRC)
+ return false;
- ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
-
+ SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
+ *SrcRC, I.getOperand(1));
const DebugLoc &DL = I.getDebugLoc();
- MachineInstr *Copy = BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
- .addReg(SrcReg, 0, SubRegs[Offset / DstSize]);
+ BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
+ .addReg(SrcReg, 0, SubReg);
- for (const MachineOperand &MO : Copy->operands()) {
- const TargetRegisterClass *RC =
- TRI.getConstrainedRegClassForOperand(MO, *MRI);
- if (!RC)
- continue;
- RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
- }
I.eraseFromParent();
return true;
}
@@ -563,6 +618,90 @@ bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
return true;
}
+static bool isZero(Register Reg, const MachineRegisterInfo &MRI) {
+ int64_t Val;
+ return mi_match(Reg, MRI, m_ICst(Val)) && Val == 0;
+}
+
+bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
+ MachineInstr &MI) const {
+ if (selectImpl(MI, *CoverageInfo))
+ return true;
+
+ const LLT S32 = LLT::scalar(32);
+ const LLT V2S16 = LLT::vector(2, 16);
+
+ Register Dst = MI.getOperand(0).getReg();
+ if (MRI->getType(Dst) != V2S16)
+ return false;
+
+ const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
+ if (DstBank->getID() != AMDGPU::SGPRRegBankID)
+ return false;
+
+ Register Src0 = MI.getOperand(1).getReg();
+ Register Src1 = MI.getOperand(2).getReg();
+ if (MRI->getType(Src0) != S32)
+ return false;
+
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineBasicBlock *BB = MI.getParent();
+
+ // TODO: This should probably be a combine somewhere
+  // (build_vector_trunc $src0, undef) -> copy $src0
+ MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
+ if (Src1Def && Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
+ MI.setDesc(TII.get(AMDGPU::COPY));
+ MI.RemoveOperand(2);
+ return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI) &&
+ RBI.constrainGenericRegister(Src0, AMDGPU::SReg_32RegClass, *MRI);
+ }
+
+ Register ShiftSrc0;
+ Register ShiftSrc1;
+ int64_t ShiftAmt;
+
+ // With multiple uses of the shift, this will duplicate the shift and
+ // increase register pressure.
+ //
+  // (build_vector_trunc (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16))
+ // => (S_PACK_HH_B32_B16 $src0, $src1)
+ // (build_vector_trunc $src0, (lshr_oneuse SReg_32:$src1, 16))
+ // => (S_PACK_LH_B32_B16 $src0, $src1)
+ // (build_vector_trunc $src0, $src1)
+ // => (S_PACK_LL_B32_B16 $src0, $src1)
+
+ // FIXME: This is an inconvenient way to check a specific value
+ bool Shift0 = mi_match(
+ Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_ICst(ShiftAmt)))) &&
+ ShiftAmt == 16;
+
+ bool Shift1 = mi_match(
+ Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_ICst(ShiftAmt)))) &&
+ ShiftAmt == 16;
+
+ unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
+ if (Shift0 && Shift1) {
+ Opc = AMDGPU::S_PACK_HH_B32_B16;
+ MI.getOperand(1).setReg(ShiftSrc0);
+ MI.getOperand(2).setReg(ShiftSrc1);
+ } else if (Shift1) {
+ Opc = AMDGPU::S_PACK_LH_B32_B16;
+ MI.getOperand(2).setReg(ShiftSrc1);
+ } else if (Shift0 && isZero(Src1, *MRI)) {
+ // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
+ auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
+ .addReg(ShiftSrc0)
+ .addImm(16);
+
+ MI.eraseFromParent();
+ return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+ }
+
+ MI.setDesc(TII.get(Opc));
+ return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
+}
+
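
For readers unfamiliar with the scalar pack instructions chosen above, a rough C++ model of their assumed semantics (helper names invented here): each S_PACK_* form takes one 16-bit half from each 32-bit source, which is exactly what the matched shift patterns express.

#include <cassert>
#include <cstdint>

// Assumed semantics: result low half comes from A, high half from B.
static uint32_t s_pack_ll(uint32_t A, uint32_t B) { return (A & 0xffffu) | (B << 16); }
static uint32_t s_pack_lh(uint32_t A, uint32_t B) { return (A & 0xffffu) | (B & 0xffff0000u); }
static uint32_t s_pack_hh(uint32_t A, uint32_t B) { return (A >> 16) | (B & 0xffff0000u); }

int main() {
  uint32_t A = 0x11112222, B = 0x33334444;
  // (build_vector_trunc (lshr A, 16), (lshr B, 16)) -> S_PACK_HH A, B
  assert(s_pack_ll(A >> 16, B >> 16) == s_pack_hh(A, B));
  // (build_vector_trunc A, (lshr B, 16)) -> S_PACK_LH A, B
  assert(s_pack_ll(A, B >> 16) == s_pack_lh(A, B));
  return 0;
}
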
bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const {
return selectG_ADD_SUB(I);
}
@@ -594,7 +733,9 @@ bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
unsigned InsSize = Src1Ty.getSizeInBits();
int64_t Offset = I.getOperand(3).getImm();
- if (Offset % 32 != 0)
+
+ // FIXME: These cases should have been illegal and unnecessary to check here.
+ if (Offset % 32 != 0 || InsSize % 32 != 0)
return false;
unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
@@ -617,7 +758,7 @@ bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
// Deal with weird cases where the class only partially supports the subreg
// index.
Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
- if (!Src0RC)
+ if (!Src0RC || !Src1RC)
return false;
if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
@@ -635,6 +776,85 @@ bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
return true;
}
+bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
+ if (STI.getLDSBankCount() != 16)
+ return selectImpl(MI, *CoverageInfo);
+
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src0 = MI.getOperand(2).getReg();
+ Register M0Val = MI.getOperand(6).getReg();
+ if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
+ !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
+ !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
+ return false;
+
+ // This requires 2 instructions. It is possible to write a pattern to support
+ // this, but the generated isel emitter doesn't correctly deal with multiple
+ // output instructions using the same physical register input. The copy to m0
+ // is incorrectly placed before the second instruction.
+ //
+ // TODO: Match source modifiers.
+
+ Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineBasicBlock *MBB = MI.getParent();
+
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
+ .addReg(M0Val);
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
+ .addImm(2)
+ .addImm(MI.getOperand(4).getImm()) // $attr
+ .addImm(MI.getOperand(3).getImm()); // $attrchan
+
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
+ .addImm(0) // $src0_modifiers
+ .addReg(Src0) // $src0
+ .addImm(MI.getOperand(4).getImm()) // $attr
+ .addImm(MI.getOperand(3).getImm()) // $attrchan
+ .addImm(0) // $src2_modifiers
+ .addReg(InterpMov) // $src2 - 2 f16 values selected by high
+ .addImm(MI.getOperand(5).getImm()) // $high
+ .addImm(0) // $clamp
+ .addImm(0); // $omod
+
+ MI.eraseFromParent();
+ return true;
+}
+
+// We need to handle this here because tablegen doesn't support matching
+// instructions with multiple outputs.
+bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
+ Register Dst0 = MI.getOperand(0).getReg();
+ Register Dst1 = MI.getOperand(1).getReg();
+
+ LLT Ty = MRI->getType(Dst0);
+ unsigned Opc;
+ if (Ty == LLT::scalar(32))
+ Opc = AMDGPU::V_DIV_SCALE_F32;
+ else if (Ty == LLT::scalar(64))
+ Opc = AMDGPU::V_DIV_SCALE_F64;
+ else
+ return false;
+
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineBasicBlock *MBB = MI.getParent();
+
+ Register Numer = MI.getOperand(3).getReg();
+ Register Denom = MI.getOperand(4).getReg();
+ unsigned ChooseDenom = MI.getOperand(5).getImm();
+
+ Register Src0 = ChooseDenom != 0 ? Numer : Denom;
+
+ auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
+ .addDef(Dst1)
+ .addUse(Src0)
+ .addUse(Denom)
+ .addUse(Numer);
+
+ MI.eraseFromParent();
+ return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+}
+
bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
unsigned IntrinsicID = I.getIntrinsicID();
switch (IntrinsicID) {
@@ -659,6 +879,20 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
return true;
}
+ case Intrinsic::amdgcn_interp_p1_f16:
+ return selectInterpP1F16(I);
+ case Intrinsic::amdgcn_wqm:
+ return constrainCopyLikeIntrin(I, AMDGPU::WQM);
+ case Intrinsic::amdgcn_softwqm:
+ return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
+ case Intrinsic::amdgcn_wwm:
+ return constrainCopyLikeIntrin(I, AMDGPU::WWM);
+ case Intrinsic::amdgcn_div_scale:
+ return selectDivScale(I);
+ case Intrinsic::amdgcn_icmp:
+ return selectIntrinsicIcmp(I);
+ case Intrinsic::amdgcn_ballot:
+ return selectBallot(I);
default:
return selectImpl(I, *CoverageInfo);
}
@@ -779,247 +1013,79 @@ bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
return Ret;
}
-static MachineInstr *
-buildEXP(const TargetInstrInfo &TII, MachineInstr *Insert, unsigned Tgt,
- unsigned Reg0, unsigned Reg1, unsigned Reg2, unsigned Reg3,
- unsigned VM, bool Compr, unsigned Enabled, bool Done) {
- const DebugLoc &DL = Insert->getDebugLoc();
- MachineBasicBlock &BB = *Insert->getParent();
- unsigned Opcode = Done ? AMDGPU::EXP_DONE : AMDGPU::EXP;
- return BuildMI(BB, Insert, DL, TII.get(Opcode))
- .addImm(Tgt)
- .addReg(Reg0)
- .addReg(Reg1)
- .addReg(Reg2)
- .addReg(Reg3)
- .addImm(VM)
- .addImm(Compr)
- .addImm(Enabled);
-}
-
-static bool isZero(Register Reg, MachineRegisterInfo &MRI) {
- int64_t C;
- if (mi_match(Reg, MRI, m_ICst(C)) && C == 0)
- return true;
-
- // FIXME: matcher should ignore copies
- return mi_match(Reg, MRI, m_Copy(m_ICst(C))) && C == 0;
-}
+bool AMDGPUInstructionSelector::selectIntrinsicIcmp(MachineInstr &I) const {
+ Register Dst = I.getOperand(0).getReg();
+ if (isVCC(Dst, *MRI))
+ return false;
-static unsigned extractGLC(unsigned AuxiliaryData) {
- return AuxiliaryData & 1;
-}
+ if (MRI->getType(Dst).getSizeInBits() != STI.getWavefrontSize())
+ return false;
-static unsigned extractSLC(unsigned AuxiliaryData) {
- return (AuxiliaryData >> 1) & 1;
-}
+ MachineBasicBlock *BB = I.getParent();
+ const DebugLoc &DL = I.getDebugLoc();
+ Register SrcReg = I.getOperand(2).getReg();
+ unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
+ auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
-static unsigned extractDLC(unsigned AuxiliaryData) {
- return (AuxiliaryData >> 2) & 1;
-}
+ int Opcode = getV_CMPOpcode(Pred, Size);
+ if (Opcode == -1)
+ return false;
-static unsigned extractSWZ(unsigned AuxiliaryData) {
- return (AuxiliaryData >> 3) & 1;
+ MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst)
+ .add(I.getOperand(2))
+ .add(I.getOperand(3));
+ RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), *TRI.getBoolRC(),
+ *MRI);
+ bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
+ I.eraseFromParent();
+ return Ret;
}
-static unsigned getBufferStoreOpcode(LLT Ty,
- const unsigned MemSize,
- const bool Offen) {
- const int Size = Ty.getSizeInBits();
- switch (8 * MemSize) {
- case 8:
- return Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact :
- AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact;
- case 16:
- return Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact :
- AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact;
- default:
- unsigned Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact :
- AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact;
- if (Size > 32)
- Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32);
- return Opc;
- }
-}
-
-static unsigned getBufferStoreFormatOpcode(LLT Ty,
- const unsigned MemSize,
- const bool Offen) {
- bool IsD16Packed = Ty.getScalarSizeInBits() == 16;
- bool IsD16Unpacked = 8 * MemSize < Ty.getSizeInBits();
- int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
-
- if (IsD16Packed) {
- switch (NumElts) {
- case 1:
- return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFEN_exact :
- AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFSET_exact;
- case 2:
- return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact :
- AMDGPU::BUFFER_STORE_FORMAT_D16_XY_OFFSET_exact;
- case 3:
- return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_OFFEN_exact :
- AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_OFFSET_exact;
- case 4:
- return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact :
- AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_OFFSET_exact;
- default:
- return -1;
- }
- }
-
- if (IsD16Unpacked) {
- switch (NumElts) {
- case 1:
- return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFEN_exact :
- AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFSET_exact;
- case 2:
- return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact :
- AMDGPU::BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFSET_exact;
- case 3:
- return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_gfx80_OFFEN_exact :
- AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_gfx80_OFFSET_exact;
- case 4:
- return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact :
- AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFSET_exact;
- default:
- return -1;
- }
- }
-
- switch (NumElts) {
- case 1:
- return Offen ? AMDGPU::BUFFER_STORE_FORMAT_X_OFFEN_exact :
- AMDGPU::BUFFER_STORE_FORMAT_X_OFFSET_exact;
- case 2:
- return Offen ? AMDGPU::BUFFER_STORE_FORMAT_XY_OFFEN_exact :
- AMDGPU::BUFFER_STORE_FORMAT_XY_OFFSET_exact;
- case 3:
- return Offen ? AMDGPU::BUFFER_STORE_FORMAT_XYZ_OFFEN_exact :
- AMDGPU::BUFFER_STORE_FORMAT_XYZ_OFFSET_exact;
- case 4:
- return Offen ? AMDGPU::BUFFER_STORE_FORMAT_XYZW_OFFEN_exact :
- AMDGPU::BUFFER_STORE_FORMAT_XYZW_OFFSET_exact;
- default:
- return -1;
- }
+bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
+ MachineBasicBlock *BB = I.getParent();
+ const DebugLoc &DL = I.getDebugLoc();
+ Register DstReg = I.getOperand(0).getReg();
+ const unsigned Size = MRI->getType(DstReg).getSizeInBits();
+ const bool Is64 = Size == 64;
- llvm_unreachable("unhandled buffer store");
-}
-
-// TODO: Move this to combiner
-// Returns base register, imm offset, total constant offset.
-std::tuple<Register, unsigned, unsigned>
-AMDGPUInstructionSelector::splitBufferOffsets(MachineIRBuilder &B,
- Register OrigOffset) const {
- const unsigned MaxImm = 4095;
- Register BaseReg;
- unsigned TotalConstOffset;
- MachineInstr *OffsetDef;
-
- std::tie(BaseReg, TotalConstOffset, OffsetDef)
- = AMDGPU::getBaseWithConstantOffset(*MRI, OrigOffset);
-
- unsigned ImmOffset = TotalConstOffset;
-
- // If the immediate value is too big for the immoffset field, put the value
- // and -4096 into the immoffset field so that the value that is copied/added
- // for the voffset field is a multiple of 4096, and it stands more chance
- // of being CSEd with the copy/add for another similar load/store.f
- // However, do not do that rounding down to a multiple of 4096 if that is a
- // negative number, as it appears to be illegal to have a negative offset
- // in the vgpr, even if adding the immediate offset makes it positive.
- unsigned Overflow = ImmOffset & ~MaxImm;
- ImmOffset -= Overflow;
- if ((int32_t)Overflow < 0) {
- Overflow += ImmOffset;
- ImmOffset = 0;
- }
-
- if (Overflow != 0) {
- // In case this is in a waterfall loop, insert offset code at the def point
- // of the offset, not inside the loop.
- MachineBasicBlock::iterator OldInsPt = B.getInsertPt();
- MachineBasicBlock &OldMBB = B.getMBB();
- B.setInstr(*OffsetDef);
-
- if (!BaseReg) {
- BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- B.buildInstr(AMDGPU::V_MOV_B32_e32)
- .addDef(BaseReg)
- .addImm(Overflow);
- } else {
- Register OverflowVal = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- B.buildInstr(AMDGPU::V_MOV_B32_e32)
- .addDef(OverflowVal)
- .addImm(Overflow);
-
- Register NewBaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- TII.getAddNoCarry(B.getMBB(), B.getInsertPt(), B.getDebugLoc(), NewBaseReg)
- .addReg(BaseReg)
- .addReg(OverflowVal, RegState::Kill)
- .addImm(0);
- BaseReg = NewBaseReg;
- }
+ if (Size != STI.getWavefrontSize())
+ return false;
- B.setInsertPt(OldMBB, OldInsPt);
+ Optional<ValueAndVReg> Arg =
+ getConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI, true);
+
+ if (Arg.hasValue()) {
+ const int64_t Value = Arg.getValue().Value;
+ if (Value == 0) {
+ unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
+ BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
+ } else if (Value == -1) { // all ones
+ Register SrcReg = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
+ } else
+ return false;
+ } else {
+ Register SrcReg = I.getOperand(2).getReg();
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
}
- return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
+ I.eraseFromParent();
+ return true;
}
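
The constant folds above follow from the usual ballot semantics; a small C++ model of that assumption (not taken from the patch): ballot of an always-true condition reproduces the exec mask, and ballot of false is zero.

#include <cassert>
#include <cstdint>

// Assumed wave-wide ballot: one result bit per active lane whose condition
// holds.
static uint64_t ballot(const bool Cond[], uint64_t Exec, unsigned WaveSize) {
  uint64_t Mask = 0;
  for (unsigned Lane = 0; Lane < WaveSize; ++Lane)
    if (((Exec >> Lane) & 1) != 0 && Cond[Lane])
      Mask |= uint64_t(1) << Lane;
  return Mask;
}

int main() {
  bool AllTrue[64], AllFalse[64] = {};
  for (bool &B : AllTrue)
    B = true;
  const uint64_t Exec = 0x00ff00ff00ff00ffULL;
  assert(ballot(AllTrue, Exec, 64) == Exec); // constant -1 -> copy of EXEC
  assert(ballot(AllFalse, Exec, 64) == 0);   // constant 0  -> S_MOV of 0
  return 0;
}
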
-bool AMDGPUInstructionSelector::selectStoreIntrinsic(MachineInstr &MI,
- bool IsFormat) const {
- MachineIRBuilder B(MI);
- MachineFunction &MF = B.getMF();
- Register VData = MI.getOperand(1).getReg();
- LLT Ty = MRI->getType(VData);
-
- int Size = Ty.getSizeInBits();
- if (Size % 32 != 0)
- return false;
-
- // FIXME: Verifier should enforce 1 MMO for these intrinsics.
- MachineMemOperand *MMO = *MI.memoperands_begin();
- const int MemSize = MMO->getSize();
-
- Register RSrc = MI.getOperand(2).getReg();
- Register VOffset = MI.getOperand(3).getReg();
- Register SOffset = MI.getOperand(4).getReg();
- unsigned AuxiliaryData = MI.getOperand(5).getImm();
- unsigned ImmOffset;
- unsigned TotalOffset;
-
- std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
- if (TotalOffset != 0)
- MMO = MF.getMachineMemOperand(MMO, TotalOffset, MemSize);
-
- const bool Offen = !isZero(VOffset, *MRI);
-
- int Opc = IsFormat ? getBufferStoreFormatOpcode(Ty, MemSize, Offen) :
- getBufferStoreOpcode(Ty, MemSize, Offen);
- if (Opc == -1)
- return false;
-
- MachineInstrBuilder MIB = B.buildInstr(Opc)
- .addUse(VData);
-
- if (Offen)
- MIB.addUse(VOffset);
-
- MIB.addUse(RSrc)
- .addUse(SOffset)
- .addImm(ImmOffset)
- .addImm(extractGLC(AuxiliaryData))
- .addImm(extractSLC(AuxiliaryData))
- .addImm(0) // tfe: FIXME: Remove from inst
- .addImm(extractDLC(AuxiliaryData))
- .addImm(extractSWZ(AuxiliaryData))
- .addMemOperand(MMO);
+bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
+  // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
+ // SelectionDAG uses for wave32 vs wave64.
+ MachineBasicBlock *BB = MI.getParent();
+ BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
+ .add(MI.getOperand(1));
+ Register Reg = MI.getOperand(1).getReg();
MI.eraseFromParent();
- return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+ if (!MRI->getRegClassOrNull(Reg))
+ MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
+ return true;
}
static unsigned getDSShaderTypeValue(const MachineFunction &MF) {
@@ -1106,70 +1172,458 @@ bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
return Ret;
}
-bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
- MachineInstr &I) const {
- MachineBasicBlock *BB = I.getParent();
- unsigned IntrinsicID = I.getIntrinsicID();
- switch (IntrinsicID) {
- case Intrinsic::amdgcn_exp: {
- int64_t Tgt = I.getOperand(1).getImm();
- int64_t Enabled = I.getOperand(2).getImm();
- int64_t Done = I.getOperand(7).getImm();
- int64_t VM = I.getOperand(8).getImm();
-
- MachineInstr *Exp = buildEXP(TII, &I, Tgt, I.getOperand(3).getReg(),
- I.getOperand(4).getReg(),
- I.getOperand(5).getReg(),
- I.getOperand(6).getReg(),
- VM, false, Enabled, Done);
+static unsigned gwsIntrinToOpcode(unsigned IntrID) {
+ switch (IntrID) {
+ case Intrinsic::amdgcn_ds_gws_init:
+ return AMDGPU::DS_GWS_INIT;
+ case Intrinsic::amdgcn_ds_gws_barrier:
+ return AMDGPU::DS_GWS_BARRIER;
+ case Intrinsic::amdgcn_ds_gws_sema_v:
+ return AMDGPU::DS_GWS_SEMA_V;
+ case Intrinsic::amdgcn_ds_gws_sema_br:
+ return AMDGPU::DS_GWS_SEMA_BR;
+ case Intrinsic::amdgcn_ds_gws_sema_p:
+ return AMDGPU::DS_GWS_SEMA_P;
+ case Intrinsic::amdgcn_ds_gws_sema_release_all:
+ return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
+ default:
+ llvm_unreachable("not a gws intrinsic");
+ }
+}
- I.eraseFromParent();
- return constrainSelectedInstRegOperands(*Exp, TII, TRI, RBI);
+bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
+ Intrinsic::ID IID) const {
+ if (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
+ !STI.hasGWSSemaReleaseAll())
+ return false;
+
+ // intrinsic ID, vsrc, offset
+ const bool HasVSrc = MI.getNumOperands() == 3;
+ assert(HasVSrc || MI.getNumOperands() == 2);
+
+ Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
+ const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
+ if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
+ return false;
+
+ MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
+ assert(OffsetDef);
+
+ unsigned ImmOffset;
+
+ MachineBasicBlock *MBB = MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ MachineInstr *Readfirstlane = nullptr;
+
+ // If we legalized the VGPR input, strip out the readfirstlane to analyze the
+ // incoming offset, in case there's an add of a constant. We'll have to put it
+ // back later.
+ if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
+ Readfirstlane = OffsetDef;
+ BaseOffset = OffsetDef->getOperand(1).getReg();
+ OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
}
- case Intrinsic::amdgcn_exp_compr: {
- const DebugLoc &DL = I.getDebugLoc();
- int64_t Tgt = I.getOperand(1).getImm();
- int64_t Enabled = I.getOperand(2).getImm();
- Register Reg0 = I.getOperand(3).getReg();
- Register Reg1 = I.getOperand(4).getReg();
- Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- int64_t Done = I.getOperand(5).getImm();
- int64_t VM = I.getOperand(6).getImm();
-
- BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
- MachineInstr *Exp = buildEXP(TII, &I, Tgt, Reg0, Reg1, Undef, Undef, VM,
- true, Enabled, Done);
- I.eraseFromParent();
- return constrainSelectedInstRegOperands(*Exp, TII, TRI, RBI);
+ if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
+ // If we have a constant offset, try to use the 0 in m0 as the base.
+ // TODO: Look into changing the default m0 initialization value. If the
+ // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
+ // the immediate offset.
+
+ ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+ .addImm(0);
+ } else {
+ std::tie(BaseOffset, ImmOffset, OffsetDef)
+ = AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset);
+
+ if (Readfirstlane) {
+ // We have the constant offset now, so put the readfirstlane back on the
+ // variable component.
+ if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
+ return false;
+
+ Readfirstlane->getOperand(1).setReg(BaseOffset);
+ BaseOffset = Readfirstlane->getOperand(0).getReg();
+ } else {
+ if (!RBI.constrainGenericRegister(BaseOffset,
+ AMDGPU::SReg_32RegClass, *MRI))
+ return false;
+ }
+
+ Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
+ .addReg(BaseOffset)
+ .addImm(16);
+
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
+ .addReg(M0Base);
}
- case Intrinsic::amdgcn_end_cf: {
- // FIXME: Manually selecting to avoid dealiing with the SReg_1 trick
- // SelectionDAG uses for wave32 vs wave64.
- BuildMI(*BB, &I, I.getDebugLoc(),
- TII.get(AMDGPU::SI_END_CF))
- .add(I.getOperand(1));
- Register Reg = I.getOperand(1).getReg();
- I.eraseFromParent();
+ // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
+ // offset field) % 64. Some versions of the programming guide omit the m0
+ // part, or claim it's from offset 0.
+ auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
- if (!MRI->getRegClassOrNull(Reg))
- MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
- return true;
+ if (HasVSrc) {
+ Register VSrc = MI.getOperand(1).getReg();
+ MIB.addReg(VSrc);
+ if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
+ return false;
+ }
+
+ MIB.addImm(ImmOffset)
+ .addImm(-1) // $gds
+ .cloneMemRefs(MI);
+
+ MI.eraseFromParent();
+ return true;
+}
+
+bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
+ bool IsAppend) const {
+ Register PtrBase = MI.getOperand(2).getReg();
+ LLT PtrTy = MRI->getType(PtrBase);
+ bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
+
+ unsigned Offset;
+ std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
+
+ // TODO: Should this try to look through readfirstlane like GWS?
+ if (!isDSOffsetLegal(PtrBase, Offset, 16)) {
+ PtrBase = MI.getOperand(2).getReg();
+ Offset = 0;
+ }
+
+ MachineBasicBlock *MBB = MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+ const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
+
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
+ .addReg(PtrBase);
+ BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
+ .addImm(Offset)
+ .addImm(IsGDS ? -1 : 0)
+ .cloneMemRefs(MI);
+ MI.eraseFromParent();
+ return true;
+}
+
+static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
+ bool &IsTexFail) {
+ if (TexFailCtrl)
+ IsTexFail = true;
+
+ TFE = (TexFailCtrl & 0x1) ? 1 : 0;
+ TexFailCtrl &= ~(uint64_t)0x1;
+ LWE = (TexFailCtrl & 0x2) ? 1 : 0;
+ TexFailCtrl &= ~(uint64_t)0x2;
+
+ return TexFailCtrl == 0;
+}
+
+static bool parseCachePolicy(uint64_t Value,
+ bool *GLC, bool *SLC, bool *DLC) {
+ if (GLC) {
+ *GLC = (Value & 0x1) ? 1 : 0;
+ Value &= ~(uint64_t)0x1;
+ }
+ if (SLC) {
+ *SLC = (Value & 0x2) ? 1 : 0;
+ Value &= ~(uint64_t)0x2;
+ }
+ if (DLC) {
+ *DLC = (Value & 0x4) ? 1 : 0;
+ Value &= ~(uint64_t)0x4;
+ }
+
+ return Value == 0;
+}
+
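
The cache-policy word decoded above packs one flag per bit (GLC in bit 0, SLC in bit 1, DLC in bit 2), and any leftover set bit rejects the value. A self-contained C++ restatement, simplified to the case where all three flags are requested:

#include <cassert>
#include <cstdint>

static bool decodeCachePolicy(uint64_t Value, bool &GLC, bool &SLC, bool &DLC) {
  GLC = (Value & 0x1) != 0;
  SLC = (Value & 0x2) != 0;
  DLC = (Value & 0x4) != 0;
  return (Value & ~uint64_t(0x7)) == 0; // unknown bits make the value invalid
}

int main() {
  bool GLC, SLC, DLC;
  assert(decodeCachePolicy(0x5, GLC, SLC, DLC) && GLC && !SLC && DLC);
  assert(!decodeCachePolicy(0x8, GLC, SLC, DLC)); // bit 3 is not a known flag
  return 0;
}
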
+bool AMDGPUInstructionSelector::selectImageIntrinsic(
+ MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
+ MachineBasicBlock *MBB = MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
+ AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
+
+ const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
+ const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
+ AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
+ const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo =
+ AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode);
+ unsigned IntrOpcode = Intr->BaseOpcode;
+ const bool IsGFX10 = STI.getGeneration() >= AMDGPUSubtarget::GFX10;
+
+ const int VAddrIdx = getImageVAddrIdxBegin(BaseOpcode,
+ MI.getNumExplicitDefs());
+ int NumVAddr, NumGradients;
+ std::tie(NumVAddr, NumGradients) = getImageNumVAddr(Intr, BaseOpcode);
+
+ Register VDataIn, VDataOut;
+ LLT VDataTy;
+ int NumVDataDwords = -1;
+ bool IsD16 = false;
+
+ // XXX - Can we just get the second to last argument for ctrl?
+ unsigned CtrlIdx; // Index of texfailctrl argument
+ bool Unorm;
+ if (!BaseOpcode->Sampler) {
+ Unorm = true;
+ CtrlIdx = VAddrIdx + NumVAddr + 1;
+ } else {
+ Unorm = MI.getOperand(VAddrIdx + NumVAddr + 2).getImm() != 0;
+ CtrlIdx = VAddrIdx + NumVAddr + 3;
+ }
+
+ bool TFE;
+ bool LWE;
+ bool IsTexFail = false;
+ if (!parseTexFail(MI.getOperand(CtrlIdx).getImm(), TFE, LWE, IsTexFail))
+ return false;
+
+ const int Flags = MI.getOperand(CtrlIdx + 2).getImm();
+ const bool IsA16 = (Flags & 1) != 0;
+ const bool IsG16 = (Flags & 2) != 0;
+
+ // A16 implies 16 bit gradients
+ if (IsA16 && !IsG16)
+ return false;
+
+ unsigned DMask = 0;
+ unsigned DMaskLanes = 0;
+
+ if (BaseOpcode->Atomic) {
+ VDataOut = MI.getOperand(0).getReg();
+ VDataIn = MI.getOperand(2).getReg();
+ LLT Ty = MRI->getType(VDataIn);
+
+ // Be careful to allow atomic swap on 16-bit element vectors.
+ const bool Is64Bit = BaseOpcode->AtomicX2 ?
+ Ty.getSizeInBits() == 128 :
+ Ty.getSizeInBits() == 64;
+
+ if (BaseOpcode->AtomicX2) {
+ assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
+
+ DMask = Is64Bit ? 0xf : 0x3;
+ NumVDataDwords = Is64Bit ? 4 : 2;
+ } else {
+ DMask = Is64Bit ? 0x3 : 0x1;
+ NumVDataDwords = Is64Bit ? 2 : 1;
+ }
+ } else {
+ const int DMaskIdx = 2; // Input/output + intrinsic ID.
+
+ DMask = MI.getOperand(DMaskIdx).getImm();
+ DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
+
+ if (BaseOpcode->Store) {
+ VDataIn = MI.getOperand(1).getReg();
+ VDataTy = MRI->getType(VDataIn);
+ NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
+ } else {
+ VDataOut = MI.getOperand(0).getReg();
+ VDataTy = MRI->getType(VDataOut);
+ NumVDataDwords = DMaskLanes;
+
+ // One memoperand is mandatory, except for getresinfo.
+ // FIXME: Check this in verifier.
+ if (!MI.memoperands_empty()) {
+ const MachineMemOperand *MMO = *MI.memoperands_begin();
+
+ // Infer d16 from the memory size, as the register type will be mangled by
+ // unpacked subtargets, or by TFE.
+ IsD16 = ((8 * MMO->getSize()) / DMaskLanes) < 32;
+
+ if (IsD16 && !STI.hasUnpackedD16VMem())
+ NumVDataDwords = (DMaskLanes + 1) / 2;
+ }
+ }
+ }
+
+ // Optimize _L to _LZ when _L is zero
+ if (LZMappingInfo) {
+ // The legalizer replaced the register with an immediate 0 if we need to
+ // change the opcode.
+ const MachineOperand &Lod = MI.getOperand(VAddrIdx + NumVAddr - 1);
+ if (Lod.isImm()) {
+ assert(Lod.getImm() == 0);
+ IntrOpcode = LZMappingInfo->LZ; // set new opcode to _lz variant of _l
+ }
+ }
+
+ // Optimize _mip away, when 'lod' is zero
+ if (MIPMappingInfo) {
+ const MachineOperand &Lod = MI.getOperand(VAddrIdx + NumVAddr - 1);
+ if (Lod.isImm()) {
+ assert(Lod.getImm() == 0);
+ IntrOpcode = MIPMappingInfo->NONMIP; // set new opcode to variant without _mip
+ }
+ }
+
+ // Set G16 opcode
+ if (IsG16 && !IsA16) {
+ const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
+ AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
+ assert(G16MappingInfo);
+ IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
+ }
+
+ // TODO: Check this in verifier.
+ assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
+
+ bool GLC = false;
+ bool SLC = false;
+ bool DLC = false;
+ if (BaseOpcode->Atomic) {
+ GLC = true; // TODO no-return optimization
+ if (!parseCachePolicy(MI.getOperand(CtrlIdx + 1).getImm(), nullptr, &SLC,
+ IsGFX10 ? &DLC : nullptr))
+ return false;
+ } else {
+ if (!parseCachePolicy(MI.getOperand(CtrlIdx + 1).getImm(), &GLC, &SLC,
+ IsGFX10 ? &DLC : nullptr))
+ return false;
+ }
+
+ int NumVAddrRegs = 0;
+ int NumVAddrDwords = 0;
+ for (int I = 0; I < NumVAddr; ++I) {
+ // Skip the $noregs and 0s inserted during legalization.
+ MachineOperand &AddrOp = MI.getOperand(VAddrIdx + I);
+ if (!AddrOp.isReg())
+ continue; // XXX - Break?
+
+ Register Addr = AddrOp.getReg();
+ if (!Addr)
+ break;
+
+ ++NumVAddrRegs;
+ NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
+ }
+
+ // The legalizer preprocessed the intrinsic arguments. If we aren't using
+  // NSA, these should have been packed into a single value in the first
+ // address register
+ const bool UseNSA = NumVAddrRegs != 1 && NumVAddrDwords == NumVAddrRegs;
+ if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
+ LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
+ return false;
+ }
+
+ if (IsTexFail)
+ ++NumVDataDwords;
+
+ int Opcode = -1;
+ if (IsGFX10) {
+ Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
+ UseNSA ? AMDGPU::MIMGEncGfx10NSA
+ : AMDGPU::MIMGEncGfx10Default,
+ NumVDataDwords, NumVAddrDwords);
+ } else {
+ if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
+ NumVDataDwords, NumVAddrDwords);
+ if (Opcode == -1)
+ Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
+ NumVDataDwords, NumVAddrDwords);
+ }
+ assert(Opcode != -1);
+
+ auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
+ .cloneMemRefs(MI);
+
+ if (VDataOut) {
+ if (BaseOpcode->AtomicX2) {
+ const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
+
+ Register TmpReg = MRI->createVirtualRegister(
+ Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
+ unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
+
+ MIB.addDef(TmpReg);
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
+ .addReg(TmpReg, RegState::Kill, SubReg);
+
+ } else {
+ MIB.addDef(VDataOut); // vdata output
+ }
}
- case Intrinsic::amdgcn_raw_buffer_store:
- return selectStoreIntrinsic(I, false);
- case Intrinsic::amdgcn_raw_buffer_store_format:
- return selectStoreIntrinsic(I, true);
+
+ if (VDataIn)
+ MIB.addReg(VDataIn); // vdata input
+
+ for (int i = 0; i != NumVAddrRegs; ++i) {
+ MachineOperand &SrcOp = MI.getOperand(VAddrIdx + i);
+ if (SrcOp.isReg()) {
+ assert(SrcOp.getReg() != 0);
+ MIB.addReg(SrcOp.getReg());
+ }
+ }
+
+ MIB.addReg(MI.getOperand(VAddrIdx + NumVAddr).getReg()); // rsrc
+ if (BaseOpcode->Sampler)
+ MIB.addReg(MI.getOperand(VAddrIdx + NumVAddr + 1).getReg()); // sampler
+
+ MIB.addImm(DMask); // dmask
+
+ if (IsGFX10)
+ MIB.addImm(DimInfo->Encoding);
+ MIB.addImm(Unorm);
+ if (IsGFX10)
+ MIB.addImm(DLC);
+
+ MIB.addImm(GLC);
+ MIB.addImm(SLC);
+ MIB.addImm(IsA16 && // a16 or r128
+ STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
+ if (IsGFX10)
+ MIB.addImm(IsA16 ? -1 : 0);
+
+ MIB.addImm(TFE); // tfe
+ MIB.addImm(LWE); // lwe
+ if (!IsGFX10)
+ MIB.addImm(DimInfo->DA ? -1 : 0);
+ if (BaseOpcode->HasD16)
+ MIB.addImm(IsD16 ? -1 : 0);
+
+ MI.eraseFromParent();
+ return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+}
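
Aside (illustrative, not part of the patch): the vdata sizing above boils down to counting dmask lanes and halving the count (rounded up) when the data is packed 16-bit. A minimal standalone C++ sketch of that arithmetic, with made-up helper names and no LLVM types:

#include <cassert>

// Count the enabled dmask lanes (population count of the low 4 bits).
static unsigned dmaskLanes(unsigned DMask, bool IsGather4) {
  if (IsGather4)
    return 4;
  unsigned N = 0;
  for (unsigned I = 0; I < 4; ++I)
    N += (DMask >> I) & 1;
  return N;
}

// Dwords of vdata for a load: one per lane, halved (rounded up) when the
// data is 16-bit (d16) and the target packs D16 results.
static unsigned vdataDwords(unsigned Lanes, unsigned MemBitsPerLane,
                            bool HasPackedD16) {
  bool IsD16 = MemBitsPerLane < 32; // inferred from the memory size
  if (IsD16 && HasPackedD16)
    return (Lanes + 1) / 2;
  return Lanes;
}

int main() {
  // dmask = 0b0111 selects three components.
  assert(dmaskLanes(0x7, /*IsGather4=*/false) == 3);
  // Three 16-bit lanes pack into two dwords on packed-D16 targets.
  assert(vdataDwords(3, 16, /*HasPackedD16=*/true) == 2);
  // On unpacked targets each 16-bit lane still occupies a full dword.
  assert(vdataDwords(3, 16, /*HasPackedD16=*/false) == 3);
  return 0;
}
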
+
+bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
+ MachineInstr &I) const {
+ unsigned IntrinsicID = I.getIntrinsicID();
+ switch (IntrinsicID) {
+ case Intrinsic::amdgcn_end_cf:
+ return selectEndCfIntrinsic(I);
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap:
return selectDSOrderedIntrinsic(I, IntrinsicID);
- default:
+ case Intrinsic::amdgcn_ds_gws_init:
+ case Intrinsic::amdgcn_ds_gws_barrier:
+ case Intrinsic::amdgcn_ds_gws_sema_v:
+ case Intrinsic::amdgcn_ds_gws_sema_br:
+ case Intrinsic::amdgcn_ds_gws_sema_p:
+ case Intrinsic::amdgcn_ds_gws_sema_release_all:
+ return selectDSGWSIntrinsic(I, IntrinsicID);
+ case Intrinsic::amdgcn_ds_append:
+ return selectDSAppendConsume(I, true);
+ case Intrinsic::amdgcn_ds_consume:
+ return selectDSAppendConsume(I, false);
+ default: {
return selectImpl(I, *CoverageInfo);
}
+ }
}
bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
+ if (selectImpl(I, *CoverageInfo))
+ return true;
+
MachineBasicBlock *BB = I.getParent();
const DebugLoc &DL = I.getDebugLoc();
@@ -1247,9 +1701,6 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
Register SrcReg = I.getOperand(1).getReg();
const LLT DstTy = MRI->getType(DstReg);
const LLT SrcTy = MRI->getType(SrcReg);
- if (!DstTy.isScalar())
- return false;
-
const LLT S1 = LLT::scalar(1);
const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
@@ -1264,6 +1715,8 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
return false;
}
+ const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
+
unsigned DstSize = DstTy.getSizeInBits();
unsigned SrcSize = SrcTy.getSizeInBits();
@@ -1271,6 +1724,73 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
= TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI);
const TargetRegisterClass *DstRC
= TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI);
+ if (!SrcRC || !DstRC)
+ return false;
+
+ if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
+ !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
+ LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
+ return false;
+ }
+
+ if (DstTy == LLT::vector(2, 16) && SrcTy == LLT::vector(2, 32)) {
+ MachineBasicBlock *MBB = I.getParent();
+ const DebugLoc &DL = I.getDebugLoc();
+
+ Register LoReg = MRI->createVirtualRegister(DstRC);
+ Register HiReg = MRI->createVirtualRegister(DstRC);
+ BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
+ .addReg(SrcReg, 0, AMDGPU::sub0);
+ BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
+ .addReg(SrcReg, 0, AMDGPU::sub1);
+
+ if (IsVALU && STI.hasSDWA()) {
+ // Write the low 16-bits of the high element into the high 16-bits of the
+ // low element.
+ MachineInstr *MovSDWA =
+ BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
+ .addImm(0) // $src0_modifiers
+ .addReg(HiReg) // $src0
+ .addImm(0) // $clamp
+ .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
+ .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
+ .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
+ .addReg(LoReg, RegState::Implicit);
+ MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
+ } else {
+ Register TmpReg0 = MRI->createVirtualRegister(DstRC);
+ Register TmpReg1 = MRI->createVirtualRegister(DstRC);
+ Register ImmReg = MRI->createVirtualRegister(DstRC);
+ if (IsVALU) {
+ BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
+ .addImm(16)
+ .addReg(HiReg);
+ } else {
+ BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
+ .addReg(HiReg)
+ .addImm(16);
+ }
+
+ unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
+ unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
+ unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
+
+ BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
+ .addImm(0xffff);
+ BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
+ .addReg(LoReg)
+ .addReg(ImmReg);
+ BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
+ .addReg(TmpReg0)
+ .addReg(TmpReg1);
+ }
+
+ I.eraseFromParent();
+ return true;
+ }
+
+ if (!DstTy.isScalar())
+ return false;
if (SrcSize > 32) {
int SubRegIdx = sizeToSubRegIndex(DstSize);
@@ -1279,17 +1799,17 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
// Deal with weird cases where the class only partially supports the subreg
// index.
- SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
- if (!SrcRC)
+ const TargetRegisterClass *SrcWithSubRC
+ = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
+ if (!SrcWithSubRC)
return false;
- I.getOperand(1).setSubReg(SubRegIdx);
- }
+ if (SrcWithSubRC != SrcRC) {
+ if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
+ return false;
+ }
- if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
- !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
- LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
- return false;
+ I.getOperand(1).setSubReg(SubRegIdx);
}
I.setDesc(TII.get(TargetOpcode::COPY));
@@ -1318,7 +1838,8 @@ const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
}
bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
- bool Signed = I.getOpcode() == AMDGPU::G_SEXT;
+ bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
+ bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
const DebugLoc &DL = I.getDebugLoc();
MachineBasicBlock &MBB = *I.getParent();
const Register DstReg = I.getOperand(0).getReg();
@@ -1326,7 +1847,8 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
const LLT DstTy = MRI->getType(DstReg);
const LLT SrcTy = MRI->getType(SrcReg);
- const unsigned SrcSize = SrcTy.getSizeInBits();
+ const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
+ I.getOperand(2).getImm() : SrcTy.getSizeInBits();
const unsigned DstSize = DstTy.getSizeInBits();
if (!DstTy.isScalar())
return false;
@@ -1362,7 +1884,9 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
}
if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
- if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
+ const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
+ AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
+ if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
return false;
if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
@@ -1378,13 +1902,15 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
// Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width.
- if (DstSize > 32 && SrcSize <= 32) {
+ if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
// We need a 64-bit register source, but the high bits don't matter.
Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ unsigned SubReg = InReg ? AMDGPU::sub0 : 0;
+
BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
- .addReg(SrcReg)
+ .addReg(SrcReg, 0, SubReg)
.addImm(AMDGPU::sub0)
.addReg(UndefReg)
.addImm(AMDGPU::sub1);
@@ -1487,6 +2013,103 @@ bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI);
}
+bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
+ // Only manually handle the f64 SGPR case.
+ //
+ // FIXME: This is a workaround for 2.5 different tablegen problems. Because
+ // the bit ops theoretically have a second result due to the implicit def of
+ // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
+ // that is easy by disabling the check. The result works, but uses a
+ // nonsensical sreg32orlds_and_sreg_1 regclass.
+ //
+ // The DAG emitter is more problematic, and incorrectly adds both results of
+ // the S_XOR_B32 to the variadic REG_SEQUENCE operands.
+
+ Register Dst = MI.getOperand(0).getReg();
+ const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
+ if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
+ MRI->getType(Dst) != LLT::scalar(64))
+ return false;
+
+ Register Src = MI.getOperand(1).getReg();
+ MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
+ if (Fabs)
+ Src = Fabs->getOperand(1).getReg();
+
+ if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
+ !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
+ return false;
+
+ MachineBasicBlock *BB = MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+ Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+ BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
+ .addReg(Src, 0, AMDGPU::sub0);
+ BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
+ .addReg(Src, 0, AMDGPU::sub1);
+ BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
+ .addImm(0x80000000);
+
+ // Set or toggle sign bit.
+ unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
+ BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
+ .addReg(HiReg)
+ .addReg(ConstReg);
+ BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
+ .addReg(LoReg)
+ .addImm(AMDGPU::sub0)
+ .addReg(OpReg)
+ .addImm(AMDGPU::sub1);
+ MI.eraseFromParent();
+ return true;
+}
+
+// FIXME: This is a workaround for the same tablegen problems as G_FNEG
+bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
+ Register Dst = MI.getOperand(0).getReg();
+ const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
+ if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
+ MRI->getType(Dst) != LLT::scalar(64))
+ return false;
+
+ Register Src = MI.getOperand(1).getReg();
+ MachineBasicBlock *BB = MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+ Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+ if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
+ !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
+ return false;
+
+ BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
+ .addReg(Src, 0, AMDGPU::sub0);
+ BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
+ .addReg(Src, 0, AMDGPU::sub1);
+ BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
+ .addImm(0x7fffffff);
+
+ // Clear sign bit.
+ // TODO: Should this use S_BITSET0_*?
+ BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
+ .addReg(HiReg)
+ .addReg(ConstReg);
+ BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
+ .addReg(LoReg)
+ .addImm(AMDGPU::sub0)
+ .addReg(OpReg)
+ .addImm(AMDGPU::sub1);
+
+ MI.eraseFromParent();
+ return true;
+}
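
Aside (illustrative, not part of the patch): selectG_FNEG and selectG_FABS reduce scalar f64 sign handling to 32-bit integer ops on the high word of the value. A standalone sketch of the same bit manipulation, using the 0x80000000 / 0x7fffffff constants from the code above:

#include <cassert>
#include <cstdint>
#include <cstring>

// Flip the sign bit of the high 32 bits: the equivalent of the S_XOR_B32.
static double fneg64(double X) {
  uint64_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));
  Bits ^= (uint64_t{0x80000000} << 32);
  std::memcpy(&X, &Bits, sizeof(X));
  return X;
}

// Clear the sign bit of the high 32 bits: the equivalent of the S_AND_B32.
static double fabs64(double X) {
  uint64_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));
  Bits &= ~(uint64_t{0x80000000} << 32); // high word & 0x7fffffff
  std::memcpy(&X, &Bits, sizeof(X));
  return X;
}

int main() {
  assert(fneg64(1.5) == -1.5);
  assert(fabs64(-2.25) == 2.25);
  return 0;
}
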
+
static bool isConstant(const MachineInstr &MI) {
return MI.getOpcode() == TargetOpcode::G_CONSTANT;
}
@@ -1573,6 +2196,65 @@ bool AMDGPUInstructionSelector::selectG_LOAD_ATOMICRMW(MachineInstr &I) const {
return selectImpl(I, *CoverageInfo);
}
+// TODO: No rtn optimization.
+bool AMDGPUInstructionSelector::selectG_AMDGPU_ATOMIC_CMPXCHG(
+ MachineInstr &MI) const {
+ Register PtrReg = MI.getOperand(1).getReg();
+ const LLT PtrTy = MRI->getType(PtrReg);
+ if (PtrTy.getAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
+ STI.useFlatForGlobal())
+ return selectImpl(MI, *CoverageInfo);
+
+ Register DstReg = MI.getOperand(0).getReg();
+ const LLT Ty = MRI->getType(DstReg);
+ const bool Is64 = Ty.getSizeInBits() == 64;
+ const unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
+ Register TmpReg = MRI->createVirtualRegister(
+ Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
+
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineBasicBlock *BB = MI.getParent();
+
+ Register VAddr, RSrcReg, SOffset;
+ int64_t Offset = 0;
+
+ unsigned Opcode;
+ if (selectMUBUFOffsetImpl(MI.getOperand(1), RSrcReg, SOffset, Offset)) {
+ Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN :
+ AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN;
+ } else if (selectMUBUFAddr64Impl(MI.getOperand(1), VAddr,
+ RSrcReg, SOffset, Offset)) {
+ Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN :
+ AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN;
+ } else
+ return selectImpl(MI, *CoverageInfo);
+
+ auto MIB = BuildMI(*BB, &MI, DL, TII.get(Opcode), TmpReg)
+ .addReg(MI.getOperand(2).getReg());
+
+ if (VAddr)
+ MIB.addReg(VAddr);
+
+ MIB.addReg(RSrcReg);
+ if (SOffset)
+ MIB.addReg(SOffset);
+ else
+ MIB.addImm(0);
+
+ MIB.addImm(Offset);
+ MIB.addImm(0); // slc
+ MIB.cloneMemRefs(MI);
+
+ BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), DstReg)
+ .addReg(TmpReg, RegState::Kill, SubReg);
+
+ MI.eraseFromParent();
+
+ MRI->setRegClass(
+ DstReg, Is64 ? &AMDGPU::VReg_64RegClass : &AMDGPU::VGPR_32RegClass);
+ return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+}
+
bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
MachineOperand &CondOp = I.getOperand(0);
@@ -1619,7 +2301,8 @@ bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
return true;
}
-bool AMDGPUInstructionSelector::selectG_FRAME_INDEX(MachineInstr &I) const {
+bool AMDGPUInstructionSelector::selectG_FRAME_INDEX_GLOBAL_VALUE(
+ MachineInstr &I) const {
Register DstReg = I.getOperand(0).getReg();
const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
@@ -1631,67 +2314,134 @@ bool AMDGPUInstructionSelector::selectG_FRAME_INDEX(MachineInstr &I) const {
DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
}
-bool AMDGPUInstructionSelector::selectG_PTR_MASK(MachineInstr &I) const {
- uint64_t Align = I.getOperand(2).getImm();
- const uint64_t Mask = ~((UINT64_C(1) << Align) - 1);
-
- MachineBasicBlock *BB = I.getParent();
-
+bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
Register DstReg = I.getOperand(0).getReg();
Register SrcReg = I.getOperand(1).getReg();
+ Register MaskReg = I.getOperand(2).getReg();
+ LLT Ty = MRI->getType(DstReg);
+ LLT MaskTy = MRI->getType(MaskReg);
const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
+ const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
+ if (DstRB != SrcRB) // Should only happen for hand-written MIR.
+ return false;
+
unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
- unsigned MovOpc = IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
const TargetRegisterClass &RegRC
= IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
- LLT Ty = MRI->getType(DstReg);
-
const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB,
*MRI);
const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB,
*MRI);
+ const TargetRegisterClass *MaskRC =
+ TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB, *MRI);
+
if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
- !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
+ !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
+ !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
return false;
+ MachineBasicBlock *BB = I.getParent();
const DebugLoc &DL = I.getDebugLoc();
- Register ImmReg = MRI->createVirtualRegister(&RegRC);
- BuildMI(*BB, &I, DL, TII.get(MovOpc), ImmReg)
- .addImm(Mask);
-
if (Ty.getSizeInBits() == 32) {
+ assert(MaskTy.getSizeInBits() == 32 &&
+ "ptrmask should have been narrowed during legalize");
+
BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
.addReg(SrcReg)
- .addReg(ImmReg);
+ .addReg(MaskReg);
I.eraseFromParent();
return true;
}
Register HiReg = MRI->createVirtualRegister(&RegRC);
Register LoReg = MRI->createVirtualRegister(&RegRC);
- Register MaskLo = MRI->createVirtualRegister(&RegRC);
+ // Extract the subregisters from the source pointer.
BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
.addReg(SrcReg, 0, AMDGPU::sub0);
BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
.addReg(SrcReg, 0, AMDGPU::sub1);
- BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskLo)
- .addReg(LoReg)
- .addReg(ImmReg);
+ Register MaskedLo, MaskedHi;
+
+ // Try to avoid emitting a bit operation when we only need to touch half of
+ // the 64-bit pointer.
+ APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64);
+
+ const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
+ const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
+ if ((MaskOnes & MaskLo32) == MaskLo32) {
+ // If all the bits in the low half are 1, we only need a copy for it.
+ MaskedLo = LoReg;
+ } else {
+ // Extract the mask subregister and apply the AND.
+ Register MaskLo = MRI->createVirtualRegister(&RegRC);
+ MaskedLo = MRI->createVirtualRegister(&RegRC);
+
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
+ .addReg(MaskReg, 0, AMDGPU::sub0);
+ BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
+ .addReg(LoReg)
+ .addReg(MaskLo);
+ }
+
+ if ((MaskOnes & MaskHi32) == MaskHi32) {
+ // If all the bits in the high half are 1, we only need a copy for it.
+ MaskedHi = HiReg;
+ } else {
+ Register MaskHi = MRI->createVirtualRegister(&RegRC);
+ MaskedHi = MRI->createVirtualRegister(&RegRC);
+
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
+ .addReg(MaskReg, 0, AMDGPU::sub1);
+ BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
+ .addReg(HiReg)
+ .addReg(MaskHi);
+ }
+
BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
- .addReg(MaskLo)
+ .addReg(MaskedLo)
.addImm(AMDGPU::sub0)
- .addReg(HiReg)
+ .addReg(MaskedHi)
.addImm(AMDGPU::sub1);
I.eraseFromParent();
return true;
}
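
Aside (illustrative, not part of the patch): the known-bits check above turns the AND into a plain copy for any 32-bit half of the mask that is all ones. A standalone sketch of that decision, assuming a fully known mask value (the real code queries KnownBits, which may know only some bits):

#include <cassert>
#include <cstdint>

// Decide, per 32-bit half, whether masking is a no-op (plain copy) or needs
// a real AND. Mirrors the MaskLo32/MaskHi32 checks in selectG_PTRMASK.
struct HalfPlan { bool LoNeedsAnd; bool HiNeedsAnd; };

static HalfPlan planPtrMask(uint64_t KnownOneMaskBits) {
  HalfPlan P;
  P.LoNeedsAnd = (KnownOneMaskBits & 0xffffffffull) != 0xffffffffull;
  P.HiNeedsAnd = (KnownOneMaskBits >> 32) != 0xffffffffull;
  return P;
}

int main() {
  // Aligning a pointer down to 16 bytes: mask = ~15. The high half is all
  // ones, so only the low half needs an AND.
  HalfPlan P = planPtrMask(~uint64_t{15});
  assert(P.LoNeedsAnd && !P.HiNeedsAnd);
  return 0;
}
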
+/// Return the register to use for the index value, and the subregister to use
+/// for the indirectly accessed register.
+static std::pair<Register, unsigned>
+computeIndirectRegIndex(MachineRegisterInfo &MRI,
+ const SIRegisterInfo &TRI,
+ const TargetRegisterClass *SuperRC,
+ Register IdxReg,
+ unsigned EltSize) {
+ Register IdxBaseReg;
+ int Offset;
+ MachineInstr *Unused;
+
+ std::tie(IdxBaseReg, Offset, Unused)
+ = AMDGPU::getBaseWithConstantOffset(MRI, IdxReg);
+ if (IdxBaseReg == AMDGPU::NoRegister) {
+ // This will happen if the index is a known constant. This should ordinarily
+ // be legalized out, but handle it as a register just in case.
+ assert(Offset == 0);
+ IdxBaseReg = IdxReg;
+ }
+
+ ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
+
+ // Skip out of bounds offsets, or else we would end up using an undefined
+ // register.
+ if (static_cast<unsigned>(Offset) >= SubRegs.size())
+ return std::make_pair(IdxReg, SubRegs[0]);
+ return std::make_pair(IdxBaseReg, SubRegs[Offset]);
+}
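
Aside (illustrative, not part of the patch): a constant added to the index is folded into the choice of subregister rather than into the index register. A rough standalone model with a made-up subregister table standing in for getRegSplitParts:

#include <cassert>
#include <cstdint>
#include <string>
#include <utility>
#include <vector>

// Fold a constant offset on the index into the subregister selection.
// Returns {register to use, subregister name}. SubRegs stands in for
// TRI.getRegSplitParts(SuperRC, EltSize).
static std::pair<std::string, std::string>
pickSubReg(const std::string &IdxReg, const std::string &IdxBaseReg,
           int64_t Offset, const std::vector<std::string> &SubRegs) {
  // An out-of-bounds offset would name a nonexistent subregister, so fall
  // back to the original (unsplit) index and the first subregister.
  if (Offset < 0 || static_cast<size_t>(Offset) >= SubRegs.size())
    return {IdxReg, SubRegs[0]};
  return {IdxBaseReg, SubRegs[Offset]};
}

int main() {
  std::vector<std::string> Subs = {"sub0", "sub1", "sub2", "sub3"};
  // index = %base + 2  ->  use %base with subregister sub2.
  assert(pickSubReg("%idx", "%base", 2, Subs).second == "sub2");
  return 0;
}
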
+
bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
MachineInstr &MI) const {
Register DstReg = MI.getOperand(0).getReg();
@@ -1714,6 +2464,8 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
*MRI);
const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(DstTy, *DstRB,
*MRI);
+ if (!SrcRC || !DstRC)
+ return false;
if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
!RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
@@ -1723,7 +2475,9 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
const DebugLoc &DL = MI.getDebugLoc();
const bool Is64 = DstTy.getSizeInBits() == 64;
- unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
+ unsigned SubReg;
+ std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, SrcRC, IdxReg,
+ DstTy.getSizeInBits() / 8);
if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
if (DstTy.getSizeInBits() != 32 && !Is64)
@@ -1766,6 +2520,237 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
return true;
}
+// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
+bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
+ MachineInstr &MI) const {
+ Register DstReg = MI.getOperand(0).getReg();
+ Register VecReg = MI.getOperand(1).getReg();
+ Register ValReg = MI.getOperand(2).getReg();
+ Register IdxReg = MI.getOperand(3).getReg();
+
+ LLT VecTy = MRI->getType(DstReg);
+ LLT ValTy = MRI->getType(ValReg);
+ unsigned VecSize = VecTy.getSizeInBits();
+ unsigned ValSize = ValTy.getSizeInBits();
+
+ const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
+ const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
+ const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
+
+ assert(VecTy.getElementType() == ValTy);
+
+ // The index must be scalar. If it wasn't, RegBankSelect should have moved
+ // this into a waterfall loop.
+ if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
+ return false;
+
+ const TargetRegisterClass *VecRC = TRI.getRegClassForTypeOnBank(VecTy, *VecRB,
+ *MRI);
+ const TargetRegisterClass *ValRC = TRI.getRegClassForTypeOnBank(ValTy, *ValRB,
+ *MRI);
+
+ if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
+ !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
+ !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
+ !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
+ return false;
+
+ if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
+ return false;
+
+ unsigned SubReg;
+ std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg,
+ ValSize / 8);
+
+ const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
+ STI.useVGPRIndexMode();
+
+ MachineBasicBlock *BB = MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ if (IndexMode) {
+ BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON))
+ .addReg(IdxReg)
+ .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE);
+ } else {
+ BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
+ .addReg(IdxReg);
+ }
+
+ const MCInstrDesc &RegWriteOp
+ = TII.getIndirectRegWritePseudo(VecSize, ValSize,
+ VecRB->getID() == AMDGPU::SGPRRegBankID);
+ BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
+ .addReg(VecReg)
+ .addReg(ValReg)
+ .addImm(SubReg);
+
+ if (IndexMode)
+ BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF));
+
+ MI.eraseFromParent();
+ return true;
+}
+
+static bool isZeroOrUndef(int X) {
+ return X == 0 || X == -1;
+}
+
+static bool isOneOrUndef(int X) {
+ return X == 1 || X == -1;
+}
+
+static bool isZeroOrOneOrUndef(int X) {
+ return X == 0 || X == 1 || X == -1;
+}
+
+// Normalize a VOP3P shuffle mask to refer to the low/high half of a single
+// 32-bit register.
+static Register normalizeVOP3PMask(int NewMask[2], Register Src0, Register Src1,
+ ArrayRef<int> Mask) {
+ NewMask[0] = Mask[0];
+ NewMask[1] = Mask[1];
+ if (isZeroOrOneOrUndef(Mask[0]) && isZeroOrOneOrUndef(Mask[1]))
+ return Src0;
+
+ assert(NewMask[0] == 2 || NewMask[0] == 3 || NewMask[0] == -1);
+ assert(NewMask[1] == 2 || NewMask[1] == 3 || NewMask[1] == -1);
+
+ // Shift the mask inputs to be 0/1.
+ NewMask[0] = NewMask[0] == -1 ? -1 : NewMask[0] - 2;
+ NewMask[1] = NewMask[1] == -1 ? -1 : NewMask[1] - 2;
+ return Src1;
+}
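
Aside (illustrative, not part of the patch): a worked example of the normalization above — a two-element mask in [0,1] reads src0 unchanged, while one in [2,3] reads src1 and is rebased to [0,1]. Standalone sketch with plain ints standing in for registers:

#include <array>
#include <cassert>

// Returns which source (0 or 1) a legal two-element VOP3P mask reads, and
// rewrites the mask to index into that single 32-bit source. -1 is "undef".
// Assumes the mask is already known to be a legal VOP3P shuffle mask.
static int normalizeMask(std::array<int, 2> &Mask) {
  auto InSrc0 = [](int X) { return X == -1 || X == 0 || X == 1; };
  if (InSrc0(Mask[0]) && InSrc0(Mask[1]))
    return 0;
  // Otherwise the defined lanes refer to src1 (elements 2 and 3); shift
  // them down so they address the low/high half of src1.
  for (int &X : Mask)
    if (X != -1)
      X -= 2;
  return 1;
}

int main() {
  std::array<int, 2> M = {3, 2}; // high then low element of src1
  assert(normalizeMask(M) == 1);
  assert(M[0] == 1 && M[1] == 0); // now expressed relative to src1
  return 0;
}
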
+
+// This is only legal with VOP3P instructions as an aid to op_sel matching.
+bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
+ MachineInstr &MI) const {
+ Register DstReg = MI.getOperand(0).getReg();
+ Register Src0Reg = MI.getOperand(1).getReg();
+ Register Src1Reg = MI.getOperand(2).getReg();
+ ArrayRef<int> ShufMask = MI.getOperand(3).getShuffleMask();
+
+ const LLT V2S16 = LLT::vector(2, 16);
+ if (MRI->getType(DstReg) != V2S16 || MRI->getType(Src0Reg) != V2S16)
+ return false;
+
+ if (!AMDGPU::isLegalVOP3PShuffleMask(ShufMask))
+ return false;
+
+ assert(ShufMask.size() == 2);
+ assert(STI.hasSDWA() && "no target has VOP3P but not SDWA");
+
+ MachineBasicBlock *MBB = MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
+ const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
+ const TargetRegisterClass &RC = IsVALU ?
+ AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
+
+ // Handle the degenerate case which should have folded out.
+ if (ShufMask[0] == -1 && ShufMask[1] == -1) {
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), DstReg);
+
+ MI.eraseFromParent();
+ return RBI.constrainGenericRegister(DstReg, RC, *MRI);
+ }
+
+ // A legal VOP3P mask only reads one of the sources.
+ int Mask[2];
+ Register SrcVec = normalizeVOP3PMask(Mask, Src0Reg, Src1Reg, ShufMask);
+
+ if (!RBI.constrainGenericRegister(DstReg, RC, *MRI) ||
+ !RBI.constrainGenericRegister(SrcVec, RC, *MRI))
+ return false;
+
+ // TODO: This also should have been folded out
+ if (isZeroOrUndef(Mask[0]) && isOneOrUndef(Mask[1])) {
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), DstReg)
+ .addReg(SrcVec);
+
+ MI.eraseFromParent();
+ return true;
+ }
+
+ if (Mask[0] == 1 && Mask[1] == -1) {
+ if (IsVALU) {
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
+ .addImm(16)
+ .addReg(SrcVec);
+ } else {
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
+ .addReg(SrcVec)
+ .addImm(16);
+ }
+ } else if (Mask[0] == -1 && Mask[1] == 0) {
+ if (IsVALU) {
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), DstReg)
+ .addImm(16)
+ .addReg(SrcVec);
+ } else {
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHL_B32), DstReg)
+ .addReg(SrcVec)
+ .addImm(16);
+ }
+ } else if (Mask[0] == 0 && Mask[1] == 0) {
+ if (IsVALU) {
+ // Write low half of the register into the high half.
+ MachineInstr *MovSDWA =
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
+ .addImm(0) // $src0_modifiers
+ .addReg(SrcVec) // $src0
+ .addImm(0) // $clamp
+ .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
+ .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
+ .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
+ .addReg(SrcVec, RegState::Implicit);
+ MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
+ } else {
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
+ .addReg(SrcVec)
+ .addReg(SrcVec);
+ }
+ } else if (Mask[0] == 1 && Mask[1] == 1) {
+ if (IsVALU) {
+ // Write high half of the register into the low half.
+ MachineInstr *MovSDWA =
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
+ .addImm(0) // $src0_modifiers
+ .addReg(SrcVec) // $src0
+ .addImm(0) // $clamp
+ .addImm(AMDGPU::SDWA::WORD_0) // $dst_sel
+ .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
+ .addImm(AMDGPU::SDWA::WORD_1) // $src0_sel
+ .addReg(SrcVec, RegState::Implicit);
+ MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
+ } else {
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_HH_B32_B16), DstReg)
+ .addReg(SrcVec)
+ .addReg(SrcVec);
+ }
+ } else if (Mask[0] == 1 && Mask[1] == 0) {
+ if (IsVALU) {
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_ALIGNBIT_B32), DstReg)
+ .addReg(SrcVec)
+ .addReg(SrcVec)
+ .addImm(16);
+ } else {
+ Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg)
+ .addReg(SrcVec)
+ .addImm(16);
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
+ .addReg(TmpReg)
+ .addReg(SrcVec);
+ }
+ } else
+ llvm_unreachable("all shuffle masks should be handled");
+
+ MI.eraseFromParent();
+ return true;
+}
+
bool AMDGPUInstructionSelector::select(MachineInstr &I) {
if (I.isPHI())
return selectPHI(I);
@@ -1780,9 +2765,9 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_AND:
case TargetOpcode::G_OR:
case TargetOpcode::G_XOR:
- if (selectG_AND_OR_XOR(I))
+ if (selectImpl(I, *CoverageInfo))
return true;
- return selectImpl(I, *CoverageInfo);
+ return selectG_AND_OR_XOR(I);
case TargetOpcode::G_ADD:
case TargetOpcode::G_SUB:
if (selectImpl(I, *CoverageInfo))
@@ -1800,6 +2785,14 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_CONSTANT:
case TargetOpcode::G_FCONSTANT:
return selectG_CONSTANT(I);
+ case TargetOpcode::G_FNEG:
+ if (selectImpl(I, *CoverageInfo))
+ return true;
+ return selectG_FNEG(I);
+ case TargetOpcode::G_FABS:
+ if (selectImpl(I, *CoverageInfo))
+ return true;
+ return selectG_FABS(I);
case TargetOpcode::G_EXTRACT:
return selectG_EXTRACT(I);
case TargetOpcode::G_MERGE_VALUES:
@@ -1808,6 +2801,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
return selectG_MERGE_VALUES(I);
case TargetOpcode::G_UNMERGE_VALUES:
return selectG_UNMERGE_VALUES(I);
+ case TargetOpcode::G_BUILD_VECTOR_TRUNC:
+ return selectG_BUILD_VECTOR_TRUNC(I);
case TargetOpcode::G_PTR_ADD:
return selectG_PTR_ADD(I);
case TargetOpcode::G_IMPLICIT_DEF:
@@ -1836,6 +2831,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_ATOMICRMW_UMAX:
case TargetOpcode::G_ATOMICRMW_FADD:
return selectG_LOAD_ATOMICRMW(I);
+ case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
+ return selectG_AMDGPU_ATOMIC_CMPXCHG(I);
case TargetOpcode::G_SELECT:
return selectG_SELECT(I);
case TargetOpcode::G_STORE:
@@ -1845,17 +2842,34 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_SEXT:
case TargetOpcode::G_ZEXT:
case TargetOpcode::G_ANYEXT:
+ case TargetOpcode::G_SEXT_INREG:
if (selectImpl(I, *CoverageInfo))
return true;
return selectG_SZA_EXT(I);
case TargetOpcode::G_BRCOND:
return selectG_BRCOND(I);
case TargetOpcode::G_FRAME_INDEX:
- return selectG_FRAME_INDEX(I);
- case TargetOpcode::G_PTR_MASK:
- return selectG_PTR_MASK(I);
+ case TargetOpcode::G_GLOBAL_VALUE:
+ return selectG_FRAME_INDEX_GLOBAL_VALUE(I);
+ case TargetOpcode::G_PTRMASK:
+ return selectG_PTRMASK(I);
case TargetOpcode::G_EXTRACT_VECTOR_ELT:
return selectG_EXTRACT_VECTOR_ELT(I);
+ case TargetOpcode::G_INSERT_VECTOR_ELT:
+ return selectG_INSERT_VECTOR_ELT(I);
+ case TargetOpcode::G_SHUFFLE_VECTOR:
+ return selectG_SHUFFLE_VECTOR(I);
+ case AMDGPU::G_AMDGPU_ATOMIC_INC:
+ case AMDGPU::G_AMDGPU_ATOMIC_DEC:
+ initM0(I);
+ return selectImpl(I, *CoverageInfo);
+ case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
+ case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
+ const AMDGPU::ImageDimIntrinsicInfo *Intr
+ = AMDGPU::getImageDimIntrinsicInfo(I.getIntrinsicID());
+ assert(Intr && "not an image intrinsic with image pseudo");
+ return selectImageIntrinsic(I, Intr);
+ }
default:
return selectImpl(I, *CoverageInfo);
}
@@ -1871,15 +2885,16 @@ AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
}
std::pair<Register, unsigned>
-AMDGPUInstructionSelector::selectVOP3ModsImpl(
- Register Src) const {
+AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root) const {
+ Register Src = Root.getReg();
+ Register OrigSrc = Src;
unsigned Mods = 0;
- MachineInstr *MI = MRI->getVRegDef(Src);
+ MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
if (MI && MI->getOpcode() == AMDGPU::G_FNEG) {
Src = MI->getOperand(1).getReg();
Mods |= SISrcMods::NEG;
- MI = MRI->getVRegDef(Src);
+ MI = getDefIgnoringCopies(Src, *MRI);
}
if (MI && MI->getOpcode() == AMDGPU::G_FABS) {
@@ -1887,6 +2902,20 @@ AMDGPUInstructionSelector::selectVOP3ModsImpl(
Mods |= SISrcMods::ABS;
}
+ if (Mods != 0 &&
+ RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
+ MachineInstr *UseMI = Root.getParent();
+
+ // If we looked through copies to find source modifiers on an SGPR operand,
+ // we now have an SGPR register source. To avoid potentially violating the
+ // constant bus restriction, we need to insert a copy to a VGPR.
+ Register VGPRSrc = MRI->cloneVirtualRegister(OrigSrc);
+ BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(),
+ TII.get(AMDGPU::COPY), VGPRSrc)
+ .addReg(Src);
+ Src = VGPRSrc;
+ }
+
return std::make_pair(Src, Mods);
}
@@ -1904,7 +2933,7 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
Register Src;
unsigned Mods;
- std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
return {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
@@ -1927,7 +2956,7 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
Register Src;
unsigned Mods;
- std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
return {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
@@ -1936,12 +2965,48 @@ AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
}
InstructionSelector::ComplexRendererFns
-AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const {
+AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
+ Register Reg = Root.getReg();
+ const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
+ if (Def && (Def->getOpcode() == AMDGPU::G_FNEG ||
+ Def->getOpcode() == AMDGPU::G_FABS))
+ return {};
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
+ }};
+}
+
+std::pair<Register, unsigned>
+AMDGPUInstructionSelector::selectVOP3PModsImpl(
+ Register Src, const MachineRegisterInfo &MRI) const {
+ unsigned Mods = 0;
+ MachineInstr *MI = MRI.getVRegDef(Src);
+
+ if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
+ // It's possible to see an f32 fneg here, but unlikely.
+ // TODO: Treat f32 fneg as only high bit.
+ MRI.getType(Src) == LLT::vector(2, 16)) {
+ Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
+ Src = MI->getOperand(1).getReg();
+ MI = MRI.getVRegDef(Src);
+ }
+
+ // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
+
+ // Packed instructions do not have abs modifiers.
+ Mods |= SISrcMods::OP_SEL_1;
+
+ return std::make_pair(Src, Mods);
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
+ MachineRegisterInfo &MRI
+ = Root.getParent()->getParent()->getParent()->getRegInfo();
+
Register Src;
unsigned Mods;
- std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
- if (!TM.Options.NoNaNsFPMath && !isKnownNeverNaN(Src, *MRI))
- return None;
+ std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
return {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
@@ -1950,12 +3015,16 @@ AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const {
}
InstructionSelector::ComplexRendererFns
-AMDGPUInstructionSelector::selectVOP3OpSelMods0(MachineOperand &Root) const {
- // FIXME: Handle clamp and op_sel
+AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const {
+ Register Src;
+ unsigned Mods;
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
+ if (!TM.Options.NoNaNsFPMath && !isKnownNeverNaN(Src, *MRI))
+ return None;
+
return {{
- [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // src_mods
- [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // clamp
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
}};
}
@@ -1977,15 +3046,15 @@ AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
return None;
const GEPInfo &GEPInfo = AddrInfo[0];
-
- if (!AMDGPU::isLegalSMRDImmOffset(STI, GEPInfo.Imm))
+ Optional<int64_t> EncodedImm =
+ AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm, false);
+ if (!EncodedImm)
return None;
unsigned PtrReg = GEPInfo.SgprParts[0];
- int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm);
return {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(EncodedImm); }
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
}};
}
@@ -1998,14 +3067,15 @@ AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
return None;
const GEPInfo &GEPInfo = AddrInfo[0];
- unsigned PtrReg = GEPInfo.SgprParts[0];
- int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm);
- if (!isUInt<32>(EncodedImm))
+ Register PtrReg = GEPInfo.SgprParts[0];
+ Optional<int64_t> EncodedImm =
+ AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
+ if (!EncodedImm)
return None;
return {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(EncodedImm); }
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
}};
}
@@ -2023,14 +3093,15 @@ AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
return None;
const GEPInfo &GEPInfo = AddrInfo[0];
- if (!GEPInfo.Imm || !isUInt<32>(GEPInfo.Imm))
+ // SGPR offset is unsigned.
+ if (!GEPInfo.Imm || GEPInfo.Imm < 0 || !isUInt<32>(GEPInfo.Imm))
return None;
// If we make it this far, we have a load with a 32-bit immediate offset.
// It is OK to select this using an SGPR offset, because we have already
// failed trying to select this load into one of the _IMM variants since
// the _IMM patterns are considered before the _SGPR patterns.
- unsigned PtrReg = GEPInfo.SgprParts[0];
+ Register PtrReg = GEPInfo.SgprParts[0];
Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
.addImm(GEPInfo.Imm);
@@ -2099,7 +3170,8 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
int64_t Offset = 0;
- if (mi_match(Root.getReg(), *MRI, m_ICst(Offset))) {
+ if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
+ Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
// TODO: Should this be inside the render function? The iterator seems to
@@ -2118,17 +3190,17 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
const MachineMemOperand *MMO = *MI->memoperands_begin();
const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
- Register SOffsetReg = isStackPtrRelative(PtrInfo)
- ? Info->getStackPtrOffsetReg()
- : Info->getScratchWaveOffsetReg();
- MIB.addReg(SOffsetReg);
+ if (isStackPtrRelative(PtrInfo))
+ MIB.addReg(Info->getStackPtrOffsetReg());
+ else
+ MIB.addImm(0);
},
[=](MachineInstrBuilder &MIB) { // offset
MIB.addImm(Offset & 4095);
}}};
}
- assert(Offset == 0);
+ assert(Offset == 0 || Offset == -1);
// Try to fold a frame index directly into the MUBUF vaddr field, and any
// offsets.
@@ -2158,13 +3230,6 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
}
}
- // If we don't know this private access is a local stack object, it needs to
- // be relative to the entry point's scratch wave offset register.
- // TODO: Should split large offsets that don't fit like above.
- // TODO: Don't use scratch wave offset just because the offset didn't fit.
- Register SOffset = FI.hasValue() ? Info->getStackPtrOffsetReg()
- : Info->getScratchWaveOffsetReg();
-
return {{[=](MachineInstrBuilder &MIB) { // rsrc
MIB.addReg(Info->getScratchRSrcReg());
},
@@ -2175,15 +3240,22 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
MIB.addReg(VAddr);
},
[=](MachineInstrBuilder &MIB) { // soffset
- MIB.addReg(SOffset);
+ // If we don't know this private access is a local stack object, it
+ // needs to be relative to the entry point's scratch wave offset.
+ // TODO: Should split large offsets that don't fit like above.
+ // TODO: Don't use scratch wave offset just because the offset
+ // didn't fit.
+ if (!Info->isEntryFunction() && FI.hasValue())
+ MIB.addReg(Info->getStackPtrOffsetReg());
+ else
+ MIB.addImm(0);
},
[=](MachineInstrBuilder &MIB) { // offset
MIB.addImm(Offset);
}}};
}
-bool AMDGPUInstructionSelector::isDSOffsetLegal(const MachineRegisterInfo &MRI,
- const MachineOperand &Base,
+bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
int64_t Offset,
unsigned OffsetBits) const {
if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
@@ -2195,7 +3267,7 @@ bool AMDGPUInstructionSelector::isDSOffsetLegal(const MachineRegisterInfo &MRI,
// On Southern Islands, instructions with a negative base value and an offset
// don't seem to work.
- return KnownBits->signBitIsZero(Base.getReg());
+ return KnownBits->signBitIsZero(Base);
}
InstructionSelector::ComplexRendererFns
@@ -2214,68 +3286,485 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffset(
const MachineMemOperand *MMO = *MI->memoperands_begin();
const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
- Register SOffsetReg = isStackPtrRelative(PtrInfo)
- ? Info->getStackPtrOffsetReg()
- : Info->getScratchWaveOffsetReg();
return {{
- [=](MachineInstrBuilder &MIB) {
+ [=](MachineInstrBuilder &MIB) { // rsrc
MIB.addReg(Info->getScratchRSrcReg());
- }, // rsrc
- [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffsetReg); }, // soffset
- [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
+ },
+ [=](MachineInstrBuilder &MIB) { // soffset
+ if (isStackPtrRelative(PtrInfo))
+ MIB.addReg(Info->getStackPtrOffsetReg());
+ else
+ MIB.addImm(0);
+ },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
}};
}
+std::pair<Register, unsigned>
+AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
+ const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
+ if (!RootDef)
+ return std::make_pair(Root.getReg(), 0);
+
+ int64_t ConstAddr = 0;
+
+ Register PtrBase;
+ int64_t Offset;
+ std::tie(PtrBase, Offset) =
+ getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
+
+ if (Offset) {
+ if (isDSOffsetLegal(PtrBase, Offset, 16)) {
+ // (add n0, c0)
+ return std::make_pair(PtrBase, Offset);
+ }
+ } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
+ // TODO
+
+
+ } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
+ // TODO
+
+ }
+
+ return std::make_pair(Root.getReg(), 0);
+}
+
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
+ Register Reg;
+ unsigned Offset;
+ std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
+ Register Reg;
+ unsigned Offset;
+ std::tie(Reg, Offset) = selectDS64Bit4ByteAlignedImpl(Root);
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
+ }};
+}
+
+std::pair<Register, unsigned>
+AMDGPUInstructionSelector::selectDS64Bit4ByteAlignedImpl(MachineOperand &Root) const {
const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
- if (!RootDef) {
- return {{
- [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }
- }};
- }
+ if (!RootDef)
+ return std::make_pair(Root.getReg(), 0);
int64_t ConstAddr = 0;
- if (isBaseWithConstantOffset(Root, *MRI)) {
- const MachineOperand &LHS = RootDef->getOperand(1);
- const MachineOperand &RHS = RootDef->getOperand(2);
- const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg());
- const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg());
- if (LHSDef && RHSDef) {
- int64_t PossibleOffset =
- RHSDef->getOperand(1).getCImm()->getSExtValue();
- if (isDSOffsetLegal(*MRI, LHS, PossibleOffset, 16)) {
- // (add n0, c0)
- return {{
- [=](MachineInstrBuilder &MIB) { MIB.add(LHS); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(PossibleOffset); }
- }};
- }
+
+ Register PtrBase;
+ int64_t Offset;
+ std::tie(PtrBase, Offset) =
+ getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
+
+ if (Offset) {
+ int64_t DWordOffset0 = Offset / 4;
+ int64_t DWordOffset1 = DWordOffset0 + 1;
+ if (isDSOffsetLegal(PtrBase, DWordOffset1, 8)) {
+ // (add n0, c0)
+ return std::make_pair(PtrBase, DWordOffset0);
}
} else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
+ // TODO
+ } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
+ // TODO
+ }
- } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
+ return std::make_pair(Root.getReg(), 0);
+}
+
+/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
+/// the base value with the constant offset. There may be intervening copies
+/// between \p Root and the identified constant. Returns \p Root, 0 if this does
+/// not match the pattern.
+std::pair<Register, int64_t>
+AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
+ Register Root, const MachineRegisterInfo &MRI) const {
+ MachineInstr *RootI = MRI.getVRegDef(Root);
+ if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
+ return {Root, 0};
+
+ MachineOperand &RHS = RootI->getOperand(2);
+ Optional<ValueAndVReg> MaybeOffset
+ = getConstantVRegValWithLookThrough(RHS.getReg(), MRI, true);
+ if (!MaybeOffset)
+ return {Root, 0};
+ return {RootI->getOperand(1).getReg(), MaybeOffset->Value};
+}
+
+static void addZeroImm(MachineInstrBuilder &MIB) {
+ MIB.addImm(0);
+}
+
+/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
+/// BasePtr is not valid, a null base pointer will be used.
+static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
+ uint32_t FormatLo, uint32_t FormatHi,
+ Register BasePtr) {
+ Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
+
+ B.buildInstr(AMDGPU::S_MOV_B32)
+ .addDef(RSrc2)
+ .addImm(FormatLo);
+ B.buildInstr(AMDGPU::S_MOV_B32)
+ .addDef(RSrc3)
+ .addImm(FormatHi);
+
+ // Build the register half that holds the constants before building the
+ // full 128-bit register. If we are building multiple resource descriptors,
+ // this will allow CSEing of the 2-component register.
+ B.buildInstr(AMDGPU::REG_SEQUENCE)
+ .addDef(RSrcHi)
+ .addReg(RSrc2)
+ .addImm(AMDGPU::sub0)
+ .addReg(RSrc3)
+ .addImm(AMDGPU::sub1);
+
+ Register RSrcLo = BasePtr;
+ if (!BasePtr) {
+ RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ B.buildInstr(AMDGPU::S_MOV_B64)
+ .addDef(RSrcLo)
+ .addImm(0);
+ }
+
+ B.buildInstr(AMDGPU::REG_SEQUENCE)
+ .addDef(RSrc)
+ .addReg(RSrcLo)
+ .addImm(AMDGPU::sub0_sub1)
+ .addReg(RSrcHi)
+ .addImm(AMDGPU::sub2_sub3);
+
+ return RSrc;
+}
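
Aside (illustrative, not part of the patch): buildRSRC assembles the 128-bit descriptor as base pointer in dwords 0-1, FormatLo in dword 2, and FormatHi in dword 3. A standalone sketch of that packing (the meaning of the format fields beyond their position is not spelled out here):

#include <array>
#include <cassert>
#include <cstdint>

// Pack a MUBUF-style resource descriptor: 64-bit base pointer in the low two
// dwords, then the two format/config dwords, mirroring the REG_SEQUENCE in
// buildRSRC above.
static std::array<uint32_t, 4> packRsrc(uint64_t BasePtr, uint32_t FormatLo,
                                        uint32_t FormatHi) {
  return {static_cast<uint32_t>(BasePtr),       // sub0
          static_cast<uint32_t>(BasePtr >> 32), // sub1
          FormatLo,                             // sub2
          FormatHi};                            // sub3
}

int main() {
  auto R = packRsrc(0x1122334455667788ull, 0, 0xdeadbeef);
  assert(R[0] == 0x55667788u && R[1] == 0x11223344u && R[3] == 0xdeadbeefu);
  return 0;
}
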
+
+static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
+ const SIInstrInfo &TII, Register BasePtr) {
+ uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
+
+ // FIXME: Why are half the "default" bits ignored based on the addressing
+ // mode?
+ return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
+}
+
+static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
+ const SIInstrInfo &TII, Register BasePtr) {
+ uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
+
+ // FIXME: Why are half the "default" bits ignored based on the addressing
+ // mode?
+ return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
+}
+
+AMDGPUInstructionSelector::MUBUFAddressData
+AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
+ MUBUFAddressData Data;
+ Data.N0 = Src;
+
+ Register PtrBase;
+ int64_t Offset;
+
+ std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
+ if (isUInt<32>(Offset)) {
+ Data.N0 = PtrBase;
+ Data.Offset = Offset;
+ }
+
+ if (MachineInstr *InputAdd
+ = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
+ Data.N2 = InputAdd->getOperand(1).getReg();
+ Data.N3 = InputAdd->getOperand(2).getReg();
+
+ // FIXME: Need to fix extra SGPR->VGPR copies inserted
+ // FIXME: Don't know this was defined by operand 0
+ //
+ // TODO: Remove this when we have copy folding optimizations after
+ // RegBankSelect.
+ Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
+ Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
+ }
+
+ return Data;
+}
+
+/// Return true if the addr64 MUBUF mode should be used for the given address.
+bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
+ // (ptr_add N2, N3) -> addr64, or
+ // (ptr_add (ptr_add N2, N3), C1) -> addr64
+ if (Addr.N2)
+ return true;
+
+ const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
+ return N0Bank->getID() == AMDGPU::VGPRRegBankID;
+}
+/// Split an immediate offset \p ImmOffset depending on whether it fits in the
+/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
+/// component.
+void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
+ MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
+ if (SIInstrInfo::isLegalMUBUFImmOffset(ImmOffset))
+ return;
+
+ // Illegal offset, store it in soffset.
+ SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ B.buildInstr(AMDGPU::S_MOV_B32)
+ .addDef(SOffset)
+ .addImm(ImmOffset);
+ ImmOffset = 0;
+}
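
Aside (illustrative, not part of the patch): when the offset does not fit the MUBUF immediate field it is moved wholesale into soffset and the immediate becomes 0. A standalone sketch of that placement, assuming the 12-bit unsigned limit that isLegalMUBUFImmOffset enforces (the exact width is an assumption here):

#include <cassert>
#include <cstdint>
#include <optional>

// MUBUF offset handling: a value that fits the unsigned 12-bit immediate
// field stays there; anything else is materialized into soffset and the
// immediate field becomes 0.
struct MubufOffset {
  std::optional<int64_t> SOffsetValue; // value to move into soffset
  int64_t ImmOffset;                   // value for the offset field
};

static MubufOffset placeOffset(int64_t Offset) {
  if (Offset >= 0 && Offset < (1 << 12))
    return {std::nullopt, Offset};
  return {Offset, 0};
}

int main() {
  assert(placeOffset(4092).ImmOffset == 4092);     // fits: stays immediate
  assert(placeOffset(8192).ImmOffset == 0);        // too big: imm becomes 0
  assert(*placeOffset(8192).SOffsetValue == 8192); // ...moved to soffset
  return 0;
}
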
+bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
+ MachineOperand &Root, Register &VAddr, Register &RSrcReg,
+ Register &SOffset, int64_t &Offset) const {
+ // FIXME: Predicates should stop this from reaching here.
+ // The addr64 bit was removed for Volcanic Islands.
+ if (!STI.hasAddr64() || STI.useFlatForGlobal())
+ return false;
+
+ MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
+ if (!shouldUseAddr64(AddrData))
+ return false;
+
+ Register N0 = AddrData.N0;
+ Register N2 = AddrData.N2;
+ Register N3 = AddrData.N3;
+ Offset = AddrData.Offset;
+
+ // Base pointer for the SRD.
+ Register SRDPtr;
+
+ if (N2) {
+ if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
+ assert(N3);
+ if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
+ // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
+ // addr64, and construct the default resource from a 0 address.
+ VAddr = N0;
+ } else {
+ SRDPtr = N3;
+ VAddr = N2;
+ }
+ } else {
+ // N2 is not divergent.
+ SRDPtr = N2;
+ VAddr = N3;
+ }
+ } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
+ // Use the default null pointer in the resource
+ VAddr = N0;
+ } else {
+ // N0 -> offset, or
+ // (N0 + C1) -> offset
+ SRDPtr = N0;
}
+ MachineIRBuilder B(*Root.getParent());
+ RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
+ splitIllegalMUBUFOffset(B, SOffset, Offset);
+ return true;
+}
+
+bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
+ MachineOperand &Root, Register &RSrcReg, Register &SOffset,
+ int64_t &Offset) const {
+ MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
+ if (shouldUseAddr64(AddrData))
+ return false;
+
+ // N0 -> offset, or
+ // (N0 + C1) -> offset
+ Register SRDPtr = AddrData.N0;
+ Offset = AddrData.Offset;
+
+ // TODO: Look through extensions for 32-bit soffset.
+ MachineIRBuilder B(*Root.getParent());
+
+ RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
+ splitIllegalMUBUFOffset(B, SOffset, Offset);
+ return true;
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
+ Register VAddr;
+ Register RSrcReg;
+ Register SOffset;
+ int64_t Offset = 0;
+
+ if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
+ return {};
+
+ // FIXME: Use defaulted operands for trailing 0s and remove from the complex
+ // pattern.
return {{
- [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }
+ [=](MachineInstrBuilder &MIB) { // rsrc
+ MIB.addReg(RSrcReg);
+ },
+ [=](MachineInstrBuilder &MIB) { // vaddr
+ MIB.addReg(VAddr);
+ },
+ [=](MachineInstrBuilder &MIB) { // soffset
+ if (SOffset)
+ MIB.addReg(SOffset);
+ else
+ MIB.addImm(0);
+ },
+ [=](MachineInstrBuilder &MIB) { // offset
+ MIB.addImm(Offset);
+ },
+ addZeroImm, // glc
+ addZeroImm, // slc
+ addZeroImm, // tfe
+ addZeroImm, // dlc
+ addZeroImm // swz
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
+ Register RSrcReg;
+ Register SOffset;
+ int64_t Offset = 0;
+
+ if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
+ return {};
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { // rsrc
+ MIB.addReg(RSrcReg);
+ },
+ [=](MachineInstrBuilder &MIB) { // soffset
+ if (SOffset)
+ MIB.addReg(SOffset);
+ else
+ MIB.addImm(0);
+ },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
+ addZeroImm, // glc
+ addZeroImm, // slc
+ addZeroImm, // tfe
+ addZeroImm, // dlc
+ addZeroImm // swz
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectMUBUFAddr64Atomic(MachineOperand &Root) const {
+ Register VAddr;
+ Register RSrcReg;
+ Register SOffset;
+ int64_t Offset = 0;
+
+ if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
+ return {};
+
+ // FIXME: Use defaulted operands for trailing 0s and remove from the complex
+ // pattern.
+ return {{
+ [=](MachineInstrBuilder &MIB) { // rsrc
+ MIB.addReg(RSrcReg);
+ },
+ [=](MachineInstrBuilder &MIB) { // vaddr
+ MIB.addReg(VAddr);
+ },
+ [=](MachineInstrBuilder &MIB) { // soffset
+ if (SOffset)
+ MIB.addReg(SOffset);
+ else
+ MIB.addImm(0);
+ },
+ [=](MachineInstrBuilder &MIB) { // offset
+ MIB.addImm(Offset);
+ },
+ addZeroImm // slc
}};
}
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const {
+ Register RSrcReg;
+ Register SOffset;
+ int64_t Offset = 0;
+
+ if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
+ return {};
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { // rsrc
+ MIB.addReg(RSrcReg);
+ },
+ [=](MachineInstrBuilder &MIB) { // soffset
+ if (SOffset)
+ MIB.addReg(SOffset);
+ else
+ MIB.addImm(0);
+ },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
+ addZeroImm // slc
+ }};
+}
+
+/// Get an immediate that must be 32 bits, treated as zero-extended.
+static Optional<uint64_t> getConstantZext32Val(Register Reg,
+ const MachineRegisterInfo &MRI) {
+ // getConstantVRegVal sexts any values, so see if that matters.
+ Optional<int64_t> OffsetVal = getConstantVRegVal(Reg, MRI);
+ if (!OffsetVal || !isInt<32>(*OffsetVal))
+ return None;
+ return Lo_32(*OffsetVal);
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
+ Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
+ if (!OffsetVal)
+ return {};
+
+ Optional<int64_t> EncodedImm =
+ AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
+ if (!EncodedImm)
+ return {};
+
+ return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
+ assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
+
+ Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
+ if (!OffsetVal)
+ return {};
+
+ Optional<int64_t> EncodedImm
+ = AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
+ if (!EncodedImm)
+ return {};
+
+ return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
+}
+
void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
const MachineInstr &MI,
int OpIdx) const {
assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
"Expected G_CONSTANT");
- Optional<int64_t> CstVal = getConstantVRegVal(MI.getOperand(0).getReg(), *MRI);
- assert(CstVal && "Expected constant value");
- MIB.addImm(CstVal.getValue());
+ MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
}
void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
@@ -2316,6 +3805,34 @@ void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
MIB.addImm(MI.getOperand(OpIdx).getImm());
}
+void AMDGPUInstructionSelector::renderExtractGLC(MachineInstrBuilder &MIB,
+ const MachineInstr &MI,
+ int OpIdx) const {
+ assert(OpIdx >= 0 && "expected to match an immediate operand");
+ MIB.addImm(MI.getOperand(OpIdx).getImm() & 1);
+}
+
+void AMDGPUInstructionSelector::renderExtractSLC(MachineInstrBuilder &MIB,
+ const MachineInstr &MI,
+ int OpIdx) const {
+ assert(OpIdx >= 0 && "expected to match an immediate operand");
+ MIB.addImm((MI.getOperand(OpIdx).getImm() >> 1) & 1);
+}
+
+void AMDGPUInstructionSelector::renderExtractDLC(MachineInstrBuilder &MIB,
+ const MachineInstr &MI,
+ int OpIdx) const {
+ assert(OpIdx >= 0 && "expected to match an immediate operand");
+ MIB.addImm((MI.getOperand(OpIdx).getImm() >> 2) & 1);
+}
+
+void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
+ const MachineInstr &MI,
+ int OpIdx) const {
+ assert(OpIdx >= 0 && "expected to match an immediate operand");
+ MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1);
+}
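// Editor's illustrative sketch (not part of the patch above): these four
// renderers pull individual flags out of one packed immediate, with the
// assumed bit layout bit0 = glc, bit1 = slc, bit2 = dlc, bit3 = swz.
// A minimal standalone decoder under that assumption:
#include <cstdint>

struct CachePolicyBits {
  bool GLC, SLC, DLC, SWZ;
};

static CachePolicyBits decodeCachePolicy(uint64_t Imm) {
  return {/*GLC=*/(Imm & 1) != 0,
          /*SLC=*/((Imm >> 1) & 1) != 0,
          /*DLC=*/((Imm >> 2) & 1) != 0,
          /*SWZ=*/((Imm >> 3) & 1) != 0};
}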
+
bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const {
return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm());
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 38ca7fd4104bb..1fe80958917d6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -31,6 +31,10 @@ namespace {
namespace llvm {
+namespace AMDGPU {
+struct ImageDimIntrinsicInfo;
+}
+
class AMDGPUInstrInfo;
class AMDGPURegisterBankInfo;
class GCNSubtarget;
@@ -80,28 +84,39 @@ private:
MachineOperand getSubOperand64(MachineOperand &MO,
const TargetRegisterClass &SubRC,
unsigned SubIdx) const;
+
+ bool constrainCopyLikeIntrin(MachineInstr &MI, unsigned NewOpc) const;
bool selectCOPY(MachineInstr &I) const;
bool selectPHI(MachineInstr &I) const;
bool selectG_TRUNC(MachineInstr &I) const;
bool selectG_SZA_EXT(MachineInstr &I) const;
bool selectG_CONSTANT(MachineInstr &I) const;
+ bool selectG_FNEG(MachineInstr &I) const;
+ bool selectG_FABS(MachineInstr &I) const;
bool selectG_AND_OR_XOR(MachineInstr &I) const;
bool selectG_ADD_SUB(MachineInstr &I) const;
bool selectG_UADDO_USUBO_UADDE_USUBE(MachineInstr &I) const;
bool selectG_EXTRACT(MachineInstr &I) const;
bool selectG_MERGE_VALUES(MachineInstr &I) const;
bool selectG_UNMERGE_VALUES(MachineInstr &I) const;
+ bool selectG_BUILD_VECTOR_TRUNC(MachineInstr &I) const;
bool selectG_PTR_ADD(MachineInstr &I) const;
bool selectG_IMPLICIT_DEF(MachineInstr &I) const;
bool selectG_INSERT(MachineInstr &I) const;
- bool selectG_INTRINSIC(MachineInstr &I) const;
- std::tuple<Register, unsigned, unsigned>
- splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const;
+ bool selectInterpP1F16(MachineInstr &MI) const;
+ bool selectDivScale(MachineInstr &MI) const;
+ bool selectIntrinsicIcmp(MachineInstr &MI) const;
+ bool selectBallot(MachineInstr &I) const;
+ bool selectG_INTRINSIC(MachineInstr &I) const;
- bool selectStoreIntrinsic(MachineInstr &MI, bool IsFormat) const;
+ bool selectEndCfIntrinsic(MachineInstr &MI) const;
bool selectDSOrderedIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
+ bool selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
+ bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const;
+ bool selectImageIntrinsic(MachineInstr &MI,
+ const AMDGPU::ImageDimIntrinsicInfo *Intr) const;
bool selectG_INTRINSIC_W_SIDE_EFFECTS(MachineInstr &I) const;
int getS_CMPOpcode(CmpInst::Predicate P, unsigned Size) const;
bool selectG_ICMP(MachineInstr &I) const;
@@ -112,15 +127,18 @@ private:
void initM0(MachineInstr &I) const;
bool selectG_LOAD_ATOMICRMW(MachineInstr &I) const;
+ bool selectG_AMDGPU_ATOMIC_CMPXCHG(MachineInstr &I) const;
bool selectG_STORE(MachineInstr &I) const;
bool selectG_SELECT(MachineInstr &I) const;
bool selectG_BRCOND(MachineInstr &I) const;
- bool selectG_FRAME_INDEX(MachineInstr &I) const;
- bool selectG_PTR_MASK(MachineInstr &I) const;
+ bool selectG_FRAME_INDEX_GLOBAL_VALUE(MachineInstr &I) const;
+ bool selectG_PTRMASK(MachineInstr &I) const;
bool selectG_EXTRACT_VECTOR_ELT(MachineInstr &I) const;
+ bool selectG_INSERT_VECTOR_ELT(MachineInstr &I) const;
+ bool selectG_SHUFFLE_VECTOR(MachineInstr &I) const;
std::pair<Register, unsigned>
- selectVOP3ModsImpl(Register Src) const;
+ selectVOP3ModsImpl(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectVCSRC(MachineOperand &Root) const;
@@ -134,11 +152,18 @@ private:
selectVOP3OMods(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectVOP3Mods(MachineOperand &Root) const;
+
+ ComplexRendererFns selectVOP3NoMods(MachineOperand &Root) const;
+
InstructionSelector::ComplexRendererFns
selectVOP3Mods_nnan(MachineOperand &Root) const;
+ std::pair<Register, unsigned>
+ selectVOP3PModsImpl(Register Src, const MachineRegisterInfo &MRI) const;
+
InstructionSelector::ComplexRendererFns
- selectVOP3OpSelMods0(MachineOperand &Root) const;
+ selectVOP3PMods(MachineOperand &Root) const;
+
InstructionSelector::ComplexRendererFns
selectVOP3OpSelMods(MachineOperand &Root) const;
@@ -163,19 +188,86 @@ private:
InstructionSelector::ComplexRendererFns
selectMUBUFScratchOffset(MachineOperand &Root) const;
- bool isDSOffsetLegal(const MachineRegisterInfo &MRI,
- const MachineOperand &Base,
- int64_t Offset, unsigned OffsetBits) const;
+ bool isDSOffsetLegal(Register Base, int64_t Offset,
+ unsigned OffsetBits) const;
+ std::pair<Register, unsigned>
+ selectDS1Addr1OffsetImpl(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectDS1Addr1Offset(MachineOperand &Root) const;
+ std::pair<Register, unsigned>
+ selectDS64Bit4ByteAlignedImpl(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectDS64Bit4ByteAligned(MachineOperand &Root) const;
+
+ std::pair<Register, int64_t>
+ getPtrBaseWithConstantOffset(Register Root,
+ const MachineRegisterInfo &MRI) const;
+
+ // Parse out a chain of up to two g_ptr_add instructions.
+ // g_ptr_add (n0, _)
+ // g_ptr_add (n0, (n1 = g_ptr_add n2, n3))
+ struct MUBUFAddressData {
+ Register N0, N2, N3;
+ int64_t Offset = 0;
+ };
+
+ bool shouldUseAddr64(MUBUFAddressData AddrData) const;
+
+ void splitIllegalMUBUFOffset(MachineIRBuilder &B,
+ Register &SOffset, int64_t &ImmOffset) const;
+
+ MUBUFAddressData parseMUBUFAddress(Register Src) const;
+
+ bool selectMUBUFAddr64Impl(MachineOperand &Root, Register &VAddr,
+ Register &RSrcReg, Register &SOffset,
+ int64_t &Offset) const;
+
+ bool selectMUBUFOffsetImpl(MachineOperand &Root, Register &RSrcReg,
+ Register &SOffset, int64_t &Offset) const;
+
+ InstructionSelector::ComplexRendererFns
+ selectMUBUFAddr64(MachineOperand &Root) const;
+
+ InstructionSelector::ComplexRendererFns
+ selectMUBUFOffset(MachineOperand &Root) const;
+
+ InstructionSelector::ComplexRendererFns
+ selectMUBUFOffsetAtomic(MachineOperand &Root) const;
+
+ InstructionSelector::ComplexRendererFns
+ selectMUBUFAddr64Atomic(MachineOperand &Root) const;
+
+ ComplexRendererFns selectSMRDBufferImm(MachineOperand &Root) const;
+ ComplexRendererFns selectSMRDBufferImm32(MachineOperand &Root) const;
+
void renderTruncImm32(MachineInstrBuilder &MIB, const MachineInstr &MI,
int OpIdx = -1) const;
void renderTruncTImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
int OpIdx) const;
+ void renderTruncTImm1(MachineInstrBuilder &MIB, const MachineInstr &MI,
+ int OpIdx) const {
+ renderTruncTImm(MIB, MI, OpIdx);
+ }
+
+ void renderTruncTImm8(MachineInstrBuilder &MIB, const MachineInstr &MI,
+ int OpIdx) const {
+ renderTruncTImm(MIB, MI, OpIdx);
+ }
+
+ void renderTruncTImm16(MachineInstrBuilder &MIB, const MachineInstr &MI,
+ int OpIdx) const {
+ renderTruncTImm(MIB, MI, OpIdx);
+ }
+
+ void renderTruncTImm32(MachineInstrBuilder &MIB, const MachineInstr &MI,
+ int OpIdx) const {
+ renderTruncTImm(MIB, MI, OpIdx);
+ }
+
void renderNegateImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
int OpIdx) const;
@@ -184,6 +276,14 @@ private:
void renderPopcntImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
int OpIdx) const;
+ void renderExtractGLC(MachineInstrBuilder &MIB, const MachineInstr &MI,
+ int OpIdx) const;
+ void renderExtractSLC(MachineInstrBuilder &MIB, const MachineInstr &MI,
+ int OpIdx) const;
+ void renderExtractDLC(MachineInstrBuilder &MIB, const MachineInstr &MI,
+ int OpIdx) const;
+ void renderExtractSWZ(MachineInstrBuilder &MIB, const MachineInstr &MI,
+ int OpIdx) const;
bool isInlineImmediate16(int64_t Imm) const;
bool isInlineImmediate32(int64_t Imm) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 7e71dbdd12408..5cb7ac320d2fb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -77,6 +77,9 @@ class ILFormat<dag outs, dag ins, string asmstr, list<dag> pattern>
def TruePredicate : Predicate<"">;
+// FIXME: Tablegen should specially support this
+def FalsePredicate : Predicate<"false">;
+
// Add a predicate to the list if it does not already exist, to deduplicate it.
class PredConcat<list<Predicate> lst, Predicate pred> {
list<Predicate> ret =
@@ -101,12 +104,12 @@ class AMDGPUPat<dag pattern, dag result> : Pat<pattern, result>,
PredicateControl;
let RecomputePerFunction = 1 in {
-def FP16Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals">;
-def FP32Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals">;
-def FP64Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals">;
-def NoFP16Denormals : Predicate<"!MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals">;
-def NoFP32Denormals : Predicate<"!MF->getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals">;
-def NoFP64Denormals : Predicate<"!MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals">;
+def FP16Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().allFP64FP16Denormals()">;
+def FP32Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()">;
+def FP64Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().allFP64FP16Denormals()">;
+def NoFP16Denormals : Predicate<"!MF->getInfo<SIMachineFunctionInfo>()->getMode().allFP64FP16Denormals()">;
+def NoFP32Denormals : Predicate<"!MF->getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()">;
+def NoFP64Denormals : Predicate<"!MF->getInfo<SIMachineFunctionInfo>()->getMode().allFP64FP16Denormals()">;
def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">;
}
@@ -408,7 +411,12 @@ def atomic_load_64_#as : PatFrag<(ops node:$ptr), (atomic_load_64 node:$ptr)> {
let IsAtomic = 1;
let MemoryVT = i64;
}
+} // End let AddressSpaces
+} // End foreach as
+
+foreach as = [ "global", "flat", "local", "private", "region" ] in {
+let AddressSpaces = !cast<AddressSpaceList>("StoreAddress_"#as).AddrSpaces in {
def store_#as : PatFrag<(ops node:$val, node:$ptr),
(unindexedstore node:$val, node:$ptr)> {
let IsStore = 1;
@@ -444,8 +452,8 @@ def truncstorei16_hi16_#as : StoreHi16<truncstorei16>;
defm atomic_store_#as : binary_atomic_op<atomic_store>;
-} // End let AddressSpaces = ...
-} // End foreach AddrSpace
+} // End let AddressSpaces
+} // End foreach as
multiclass ret_noret_binary_atomic_op<SDNode atomic_op, bit IsInt = 1> {
@@ -520,7 +528,7 @@ class Constants {
int TWO_PI = 0x40c90fdb;
int PI = 0x40490fdb;
int TWO_PI_INV = 0x3e22f983;
-int FP_UINT_MAX_PLUS_1 = 0x4f800000; // 1 << 32 in floating point encoding
+int FP_4294966784 = 0x4f7ffffe; // 4294966784 = 4294967296 - 512 = 2^32 - 2^9
int FP16_ONE = 0x3C00;
int FP16_NEG_ONE = 0xBC00;
int FP32_ONE = 0x3f800000;
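// Editor's illustrative check (not part of the patch above): the new
// FP_4294966784 constant really is the float 2^32 - 2^9, as its comment
// claims. A standalone verification:
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  const uint32_t Bits = 0x4f7ffffe;
  float F;
  std::memcpy(&F, &Bits, sizeof F);
  assert(F == 4294966784.0f); // 4294967296 - 512 = 2^32 - 2^9
  return 0;
}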
@@ -731,6 +739,12 @@ multiclass BFEPattern <Instruction UBFE, Instruction SBFE, Instruction MOV> {
>;
}
+// fshr pattern
+class FSHRPattern <Instruction BIT_ALIGN> : AMDGPUPat <
+ (fshr i32:$src0, i32:$src1, i32:$src2),
+ (BIT_ALIGN $src0, $src1, $src2)
+>;
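// Editor's illustrative sketch (not part of the patch above): the
// funnel-shift-right semantics this pattern maps onto a single bit-align
// instruction, written out for 32-bit operands in plain C++.
#include <cstdint>

static uint32_t fshr32(uint32_t Hi, uint32_t Lo, uint32_t Amt) {
  // Concatenate Hi:Lo into 64 bits and shift right by Amt modulo 32;
  // the low 32 bits of the result are the funnel-shift result.
  const uint64_t Concat = (static_cast<uint64_t>(Hi) << 32) | Lo;
  return static_cast<uint32_t>(Concat >> (Amt & 31u));
}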
+
// rotr pattern
class ROTRPattern <Instruction BIT_ALIGN> : AMDGPUPat <
(rotr i32:$src0, i32:$src1),
@@ -796,3 +810,13 @@ def fmaxnum_like_oneuse : PatFrags<(ops node:$src0, node:$src1),
[(fmaxnum_ieee_oneuse node:$src0, node:$src1),
(fmaxnum_oneuse node:$src0, node:$src1)]
>;
+
+def any_fmad : PatFrags<(ops node:$src0, node:$src1, node:$src2),
+ [(fmad node:$src0, node:$src1, node:$src2),
+ (AMDGPUfmad_ftz node:$src0, node:$src1, node:$src2)]
+>;
+
+// FIXME: fsqrt should not select directly
+def any_amdgcn_sqrt : PatFrags<(ops node:$src0),
+ [(fsqrt node:$src0), (int_amdgcn_sqrt node:$src0)]
+>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 3f99d5cfb7f9a..2976794b49c3b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -11,19 +11,16 @@
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//
-#if defined(_MSC_VER) || defined(__MINGW32__)
-// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
-// from the Visual C++ cmath / math.h headers:
-// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
-#define _USE_MATH_DEFINES
-#endif
+#include "AMDGPULegalizerInfo.h"
#include "AMDGPU.h"
-#include "AMDGPULegalizerInfo.h"
+#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
+#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
@@ -37,21 +34,30 @@ using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
-
-
-static LegalityPredicate isMultiple32(unsigned TypeIdx,
- unsigned MaxSize = 1024) {
- return [=](const LegalityQuery &Query) {
- const LLT Ty = Query.Types[TypeIdx];
- const LLT EltTy = Ty.getScalarType();
- return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
- };
+using namespace MIPatternMatch;
+
+// Hack until load/store selection patterns support any tuple of legal types.
+static cl::opt<bool> EnableNewLegality(
+ "amdgpu-global-isel-new-legality",
+  cl::desc("Use GlobalISel desired legality, rather than try to use "
+           "rules compatible with selection patterns"),
+ cl::init(false),
+ cl::ReallyHidden);
+
+static constexpr unsigned MaxRegisterSize = 1024;
+
+// Round the number of elements to the next power of two elements
+static LLT getPow2VectorType(LLT Ty) {
+ unsigned NElts = Ty.getNumElements();
+ unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
+ return Ty.changeNumElements(Pow2NElts);
}
-static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
- return [=](const LegalityQuery &Query) {
- return Query.Types[TypeIdx].getSizeInBits() == Size;
- };
+// Round the number of bits to the next power of two bits
+static LLT getPow2ScalarType(LLT Ty) {
+ unsigned Bits = Ty.getSizeInBits();
+ unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
+ return LLT::scalar(Pow2Bits);
}
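// Editor's illustrative sketch (not part of the patch above): the rounding
// getPow2ScalarType performs, i.e. 1 << Log2_32_Ceil(Bits), spelled out
// with a plain loop (assuming Bits >= 1).
static unsigned roundUpToPow2(unsigned Bits) {
  unsigned P = 1;
  while (P < Bits)
    P <<= 1;
  return P; // e.g. 24 -> 32, 48 -> 64, 32 stays 32
}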
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
@@ -109,6 +115,23 @@ static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
};
}
+static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
+ return [=](const LegalityQuery &Query) {
+ const LLT Ty = Query.Types[TypeIdx];
+ unsigned Size = Ty.getSizeInBits();
+
+ LLT CoercedTy;
+ if (Size <= 32) {
+ // <2 x s8> -> s16
+ // <4 x s8> -> s32
+ CoercedTy = LLT::scalar(Size);
+ } else
+ CoercedTy = LLT::scalarOrVector(Size / 32, 32);
+
+ return std::make_pair(TypeIdx, CoercedTy);
+ };
+}
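// Editor's illustrative sketch (not part of the patch above): the coerced
// types the mutation above produces, computed from plain element sizes and
// counts (assuming any total size above 32 bits is a multiple of 32, as it
// is for the register types this is applied to).
#include <cstdio>

static void printCoercedType(unsigned EltBits, unsigned NumElts) {
  const unsigned Size = EltBits * NumElts;
  if (Size <= 32)
    std::printf("<%u x s%u> -> s%u\n", NumElts, EltBits, Size);
  else
    std::printf("<%u x s%u> -> <%u x s32>\n", NumElts, EltBits, Size / 32);
}

// printCoercedType(8, 2);  // <2 x s8>  -> s16
// printCoercedType(8, 4);  // <4 x s8>  -> s32
// printCoercedType(16, 6); // <6 x s16> -> <3 x s32>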
+
static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
return [=](const LegalityQuery &Query) {
const LLT QueryTy = Query.Types[TypeIdx];
@@ -130,25 +153,47 @@ static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
};
}
-// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
-// v2s16.
+static bool isRegisterSize(unsigned Size) {
+ return Size % 32 == 0 && Size <= MaxRegisterSize;
+}
+
+static bool isRegisterVectorElementType(LLT EltTy) {
+ const int EltSize = EltTy.getSizeInBits();
+ return EltSize == 16 || EltSize % 32 == 0;
+}
+
+static bool isRegisterVectorType(LLT Ty) {
+ const int EltSize = Ty.getElementType().getSizeInBits();
+ return EltSize == 32 || EltSize == 64 ||
+ (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
+ EltSize == 128 || EltSize == 256;
+}
+
+static bool isRegisterType(LLT Ty) {
+ if (!isRegisterSize(Ty.getSizeInBits()))
+ return false;
+
+ if (Ty.isVector())
+ return isRegisterVectorType(Ty);
+
+ return true;
+}
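// Editor's illustrative sketch (not part of the patch above): the same
// "fits evenly into 32-bit registers" test restated on plain element-size /
// element-count pairs, using the 1024-bit MaxRegisterSize cap from above.
static bool fitsInRegisters(unsigned EltBits, unsigned NumElts) {
  const unsigned TotalBits = EltBits * NumElts;
  if (TotalBits % 32 != 0 || TotalBits > 1024)
    return false;
  if (NumElts > 1) // vector case: element type must also be register-like
    return EltBits == 32 || EltBits == 64 ||
           (EltBits == 16 && NumElts % 2 == 0) ||
           EltBits == 128 || EltBits == 256;
  return true; // scalar case: only the size constraints apply
}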
+
+// Any combination of 32 or 64-bit elements up to the maximum register size, and
+// multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
return [=](const LegalityQuery &Query) {
- const LLT Ty = Query.Types[TypeIdx];
- if (Ty.isVector()) {
- const int EltSize = Ty.getElementType().getSizeInBits();
- return EltSize == 32 || EltSize == 64 ||
- (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
- EltSize == 128 || EltSize == 256;
- }
-
- return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
+ return isRegisterType(Query.Types[TypeIdx]);
};
}
-static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
+static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
return [=](const LegalityQuery &Query) {
- return Query.Types[TypeIdx].getElementType() == Type;
+ const LLT QueryTy = Query.Types[TypeIdx];
+ if (!QueryTy.isVector())
+ return false;
+ const LLT EltTy = QueryTy.getElementType();
+ return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
};
}
@@ -160,6 +205,120 @@ static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
};
}
+// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
+// handle some operations by just promoting the register during
+// selection. There are also d16 loads on GFX9+ which preserve the high bits.
+static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
+ bool IsLoad) {
+ switch (AS) {
+ case AMDGPUAS::PRIVATE_ADDRESS:
+ // FIXME: Private element size.
+ return 32;
+ case AMDGPUAS::LOCAL_ADDRESS:
+ return ST.useDS128() ? 128 : 64;
+ case AMDGPUAS::GLOBAL_ADDRESS:
+ case AMDGPUAS::CONSTANT_ADDRESS:
+ case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
+ // Treat constant and global as identical. SMRD loads are sometimes usable for
+ // global loads (ideally constant address space should be eliminated)
+ // depending on the context. Legality cannot be context dependent, but
+ // RegBankSelect can split the load as necessary depending on the pointer
+ // register bank/uniformity and if the memory is invariant or not written in a
+ // kernel.
+ return IsLoad ? 512 : 128;
+ default:
+ // Flat addresses may contextually need to be split to 32-bit parts if they
+ // may alias scratch depending on the subtarget.
+ return 128;
+ }
+}
+
+static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
+ const LegalityQuery &Query,
+ unsigned Opcode) {
+ const LLT Ty = Query.Types[0];
+
+ // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
+ const bool IsLoad = Opcode != AMDGPU::G_STORE;
+
+ unsigned RegSize = Ty.getSizeInBits();
+ unsigned MemSize = Query.MMODescrs[0].SizeInBits;
+ unsigned Align = Query.MMODescrs[0].AlignInBits;
+ unsigned AS = Query.Types[1].getAddressSpace();
+
+ // All of these need to be custom lowered to cast the pointer operand.
+ if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
+ return false;
+
+ // TODO: We should be able to widen loads if the alignment is high enough, but
+ // we also need to modify the memory access size.
+#if 0
+ // Accept widening loads based on alignment.
+ if (IsLoad && MemSize < Size)
+ MemSize = std::max(MemSize, Align);
+#endif
+
+ // Only 1-byte and 2-byte to 32-bit extloads are valid.
+ if (MemSize != RegSize && RegSize != 32)
+ return false;
+
+ if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
+ return false;
+
+ switch (MemSize) {
+ case 8:
+ case 16:
+ case 32:
+ case 64:
+ case 128:
+ break;
+ case 96:
+ if (!ST.hasDwordx3LoadStores())
+ return false;
+ break;
+ case 256:
+ case 512:
+ // These may contextually need to be broken down.
+ break;
+ default:
+ return false;
+ }
+
+ assert(RegSize >= MemSize);
+
+ if (Align < MemSize) {
+ const SITargetLowering *TLI = ST.getTargetLowering();
+ if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8))
+ return false;
+ }
+
+ return true;
+}
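// Editor's illustrative sketch (not part of the patch above): the
// extending-load restriction stated above, on plain sizes — when the
// register size and memory size differ, only 8- and 16-bit memory accesses
// extended into a 32-bit register are accepted.
static bool isAllowedExtLoad(unsigned RegBits, unsigned MemBits) {
  if (RegBits == MemBits)
    return true; // not an extending access; other rules apply
  return RegBits == 32 && (MemBits == 8 || MemBits == 16);
}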
+
+// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
+// workaround this. Eventually it should ignore the type for loads and only care
+// about the size. Return true in cases where we will workaround this for now by
+// bitcasting.
+static bool loadStoreBitcastWorkaround(const LLT Ty) {
+ if (EnableNewLegality)
+ return false;
+
+ const unsigned Size = Ty.getSizeInBits();
+ if (Size <= 64)
+ return false;
+ if (!Ty.isVector())
+ return true;
+ unsigned EltSize = Ty.getElementType().getSizeInBits();
+ return EltSize != 32 && EltSize != 64;
+}
+
+static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
+ unsigned Opcode) {
+ const LLT Ty = Query.Types[0];
+ return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
+ !loadStoreBitcastWorkaround(Ty);
+}
+
AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
const GCNTargetMachine &TM)
: ST(ST_) {
@@ -170,14 +329,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
};
const LLT S1 = LLT::scalar(1);
- const LLT S8 = LLT::scalar(8);
const LLT S16 = LLT::scalar(16);
const LLT S32 = LLT::scalar(32);
const LLT S64 = LLT::scalar(64);
- const LLT S96 = LLT::scalar(96);
const LLT S128 = LLT::scalar(128);
const LLT S256 = LLT::scalar(256);
- const LLT S1024 = LLT::scalar(1024);
+ const LLT S512 = LLT::scalar(512);
+ const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
const LLT V2S16 = LLT::vector(2, 16);
const LLT V4S16 = LLT::vector(4, 16);
@@ -244,6 +402,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
S32, S64, S16, V2S16
};
+ const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
+
setAction({G_BRCOND, S1}, Legal); // VCC branches
setAction({G_BRCOND, S32}, Legal); // SCC branches
@@ -261,11 +421,19 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
.legalIf(isPointer(0));
- if (ST.has16BitInsts()) {
+ if (ST.hasVOP3PInsts()) {
+ getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
+ .legalFor({S32, S16, V2S16})
+ .clampScalar(0, S16, S32)
+ .clampMaxNumElements(0, S16, 2)
+ .scalarize(0)
+ .widenScalarToNextPow2(0, 32);
+ } else if (ST.has16BitInsts()) {
getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
.legalFor({S32, S16})
.clampScalar(0, S16, S32)
- .scalarize(0);
+ .scalarize(0)
+ .widenScalarToNextPow2(0, 32);
} else {
getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
.legalFor({S32})
@@ -275,7 +443,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// FIXME: Not really legal. Placeholder for custom lowering.
getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
- .legalFor({S32, S64})
+ .customFor({S32, S64})
.clampScalar(0, S32, S64)
.widenScalarToNextPow2(0, 32)
.scalarize(0);
@@ -298,35 +466,16 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
getActionDefinitionsBuilder({G_UADDO, G_USUBO,
G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
.legalFor({{S32, S1}, {S32, S32}})
- .clampScalar(0, S32, S32)
- .scalarize(0); // TODO: Implement.
-
- getActionDefinitionsBuilder({G_SADDO, G_SSUBO})
+ .minScalar(0, S32)
+ // TODO: .scalarize(0)
.lower();
getActionDefinitionsBuilder(G_BITCAST)
// Don't worry about the size constraint.
.legalIf(all(isRegisterType(0), isRegisterType(1)))
- // FIXME: Testing hack
- .legalForCartesianProduct({S16, LLT::vector(2, 8), });
-
- getActionDefinitionsBuilder(G_FCONSTANT)
- .legalFor({S32, S64, S16})
- .clampScalar(0, S16, S64);
-
- getActionDefinitionsBuilder(G_IMPLICIT_DEF)
- .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
- ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
- .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
- .clampScalarOrElt(0, S32, S1024)
- .legalIf(isMultiple32(0))
- .widenScalarToNextPow2(0, 32)
- .clampMaxNumElements(0, S32, 16);
+ .lower();
- // FIXME: i1 operands to intrinsics should always be legal, but other i1
- // values may not be legal. We need to figure out how to distinguish
- // between these two scenarios.
getActionDefinitionsBuilder(G_CONSTANT)
.legalFor({S1, S32, S64, S16, GlobalPtr,
LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
@@ -334,10 +483,31 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.widenScalarToNextPow2(0)
.legalIf(isPointer(0));
+ getActionDefinitionsBuilder(G_FCONSTANT)
+ .legalFor({S32, S64, S16})
+ .clampScalar(0, S16, S64);
+
+ getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
+ .legalIf(isRegisterType(0))
+ // s1 and s16 are special cases because they have legal operations on
+ // them, but don't really occupy registers in the normal way.
+ .legalFor({S1, S16})
+ .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
+ .clampScalarOrElt(0, S32, MaxScalar)
+ .widenScalarToNextPow2(0, 32)
+ .clampMaxNumElements(0, S32, 16);
+
setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
- getActionDefinitionsBuilder(G_GLOBAL_VALUE)
- .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
+ // If the amount is divergent, we have to do a wave reduction to get the
+ // maximum value, so this is expanded during RegBankSelect.
+ getActionDefinitionsBuilder(G_DYN_STACKALLOC)
+ .legalFor({{PrivatePtr, S32}});
+
+ getActionDefinitionsBuilder(G_GLOBAL_VALUE)
+ .unsupportedFor({PrivatePtr})
+ .custom();
+ setAction({G_BLOCK_ADDR, CodePtr}, Legal);
auto &FPOpActions = getActionDefinitionsBuilder(
{ G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
@@ -397,33 +567,41 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.scalarize(0)
.clampScalar(0, S16, S64);
- // TODO: Implement
- getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
-
if (ST.has16BitInsts()) {
getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
.legalFor({S32, S64, S16})
.scalarize(0)
.clampScalar(0, S16, S64);
} else {
- getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
+ getActionDefinitionsBuilder(G_FSQRT)
.legalFor({S32, S64})
.scalarize(0)
.clampScalar(0, S32, S64);
+
+ if (ST.hasFractBug()) {
+ getActionDefinitionsBuilder(G_FFLOOR)
+ .customFor({S64})
+ .legalFor({S32, S64})
+ .scalarize(0)
+ .clampScalar(0, S32, S64);
+ } else {
+ getActionDefinitionsBuilder(G_FFLOOR)
+ .legalFor({S32, S64})
+ .scalarize(0)
+ .clampScalar(0, S32, S64);
+ }
}
getActionDefinitionsBuilder(G_FPTRUNC)
.legalFor({{S32, S64}, {S16, S32}})
- .scalarize(0);
+ .scalarize(0)
+ .lower();
getActionDefinitionsBuilder(G_FPEXT)
.legalFor({{S64, S32}, {S32, S16}})
.lowerFor({{S64, S16}}) // FIXME: Implement
.scalarize(0);
- // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
- getActionDefinitionsBuilder(G_FCOPYSIGN).lower();
-
getActionDefinitionsBuilder(G_FSUB)
// Use actual fsub instruction
.legalFor({S32})
@@ -434,22 +612,32 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// Whether this is legal depends on the floating point mode for the function.
auto &FMad = getActionDefinitionsBuilder(G_FMAD);
- if (ST.hasMadF16())
+ if (ST.hasMadF16() && ST.hasMadMacF32Insts())
FMad.customFor({S32, S16});
- else
+ else if (ST.hasMadMacF32Insts())
FMad.customFor({S32});
+ else if (ST.hasMadF16())
+ FMad.customFor({S16});
FMad.scalarize(0)
.lower();
+ // TODO: Do we need to clamp maximum bitwidth?
+ getActionDefinitionsBuilder(G_TRUNC)
+ .legalIf(isScalar(0))
+ .legalFor({{V2S16, V2S32}})
+ .clampMaxNumElements(0, S16, 2)
+ // Avoid scalarizing in cases that should be truly illegal. In unresolvable
+ // situations (like an invalid implicit use), we don't want to infinite loop
+ // in the legalizer.
+ .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
+ .alwaysLegal();
+
getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
.legalFor({{S64, S32}, {S32, S16}, {S64, S16},
- {S32, S1}, {S64, S1}, {S16, S1},
- {S96, S32},
- // FIXME: Hack
- {S64, LLT::scalar(33)},
- {S32, S8}, {S32, LLT::scalar(24)}})
+ {S32, S1}, {S64, S1}, {S16, S1}})
.scalarize(0)
- .clampScalar(0, S32, S64);
+ .clampScalar(0, S32, S64)
+ .widenScalarToNextPow2(1, 32);
// TODO: Split s1->s64 during regbankselect for VALU.
auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
@@ -460,17 +648,20 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
if (ST.has16BitInsts())
IToFP.legalFor({{S16, S16}});
IToFP.clampScalar(1, S32, S64)
- .scalarize(0);
+ .scalarize(0)
+ .widenScalarToNextPow2(1);
auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
- .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
+ .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
+ .customFor({{S64, S64}});
if (ST.has16BitInsts())
FPToI.legalFor({{S16, S16}});
else
FPToI.minScalar(1, S32);
FPToI.minScalar(0, S32)
- .scalarize(0);
+ .scalarize(0)
+ .lower();
getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
.scalarize(0)
@@ -494,16 +685,17 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.scalarize(0);
}
+ // FIXME: Clamp offset operand.
getActionDefinitionsBuilder(G_PTR_ADD)
- .legalForCartesianProduct(AddrSpaces64, {S64})
- .legalForCartesianProduct(AddrSpaces32, {S32})
+ .legalIf(isPointer(0))
.scalarize(0);
- getActionDefinitionsBuilder(G_PTR_MASK)
- .scalarize(0)
- .alwaysLegal();
-
- setAction({G_BLOCK_ADDR, CodePtr}, Legal);
+ getActionDefinitionsBuilder(G_PTRMASK)
+ .legalIf(typeInSet(1, {S64, S32}))
+ .minScalar(1, S32)
+ .maxScalarIf(sizeIs(0, 32), 1, S32)
+ .maxScalarIf(sizeIs(0, 64), 1, S64)
+ .scalarize(0);
auto &CmpBuilder =
getActionDefinitionsBuilder(G_ICMP)
@@ -537,16 +729,45 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.clampScalar(1, S32, S64)
.scalarize(0);
- // FIXME: fexp, flog2, flog10 needs to be custom lowered.
- getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
- G_FLOG, G_FLOG2, G_FLOG10})
- .legalFor({S32})
- .scalarize(0);
+ // FIXME: fpow has a selection pattern that should move to custom lowering.
+ auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
+ if (ST.has16BitInsts())
+ Exp2Ops.legalFor({S32, S16});
+ else
+ Exp2Ops.legalFor({S32});
+ Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
+ Exp2Ops.scalarize(0);
+
+ auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
+ if (ST.has16BitInsts())
+ ExpOps.customFor({{S32}, {S16}});
+ else
+ ExpOps.customFor({S32});
+ ExpOps.clampScalar(0, MinScalarFPTy, S32)
+ .scalarize(0);
+
+ // The 64-bit versions produce 32-bit results, but only on the SALU.
+ getActionDefinitionsBuilder(G_CTPOP)
+ .legalFor({{S32, S32}, {S32, S64}})
+ .clampScalar(0, S32, S32)
+ .clampScalar(1, S32, S64)
+ .scalarize(0)
+ .widenScalarToNextPow2(0, 32)
+ .widenScalarToNextPow2(1, 32);
+
+ // The hardware instructions return a different result on 0 than the generic
+ // instructions expect. The hardware produces -1, but these produce the
+ // bitwidth.
+ getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
+ .scalarize(0)
+ .clampScalar(0, S32, S32)
+ .clampScalar(1, S32, S64)
+ .widenScalarToNextPow2(0, 32)
+ .widenScalarToNextPow2(1, 32)
+ .lower();
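// Editor's illustrative sketch (not part of the patch above): bridging the
// two zero-input conventions described in the comment, for a 32-bit
// count-trailing-zeros. hwFindFirstBitLow is a hypothetical stand-in for
// the hardware behaviour (-1 on zero); the generic operation must return
// the bit width instead.
#include <cstdint>

static uint32_t hwFindFirstBitLow(uint32_t X) {
  if (X == 0)
    return ~0u; // hardware convention: all ones for a zero input
  uint32_t N = 0;
  while ((X & 1u) == 0) {
    X >>= 1;
    ++N;
  }
  return N;
}

static uint32_t genericCttz32(uint32_t X) {
  const uint32_t HW = hwFindFirstBitLow(X);
  // An unsigned min maps -1 (0xffffffff) to the bit width, 32.
  return HW < 32u ? HW : 32u;
}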
// The 64-bit versions produce 32-bit results, but only on the SALU.
- getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
- G_CTTZ, G_CTTZ_ZERO_UNDEF,
- G_CTPOP})
+ getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
.legalFor({{S32, S32}, {S32, S64}})
.clampScalar(0, S32, S32)
.clampScalar(1, S32, S64)
@@ -554,50 +775,58 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.widenScalarToNextPow2(0, 32)
.widenScalarToNextPow2(1, 32);
- // TODO: Expand for > s32
- getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
+ getActionDefinitionsBuilder(G_BITREVERSE)
.legalFor({S32})
.clampScalar(0, S32, S32)
.scalarize(0);
if (ST.has16BitInsts()) {
+ getActionDefinitionsBuilder(G_BSWAP)
+ .legalFor({S16, S32, V2S16})
+ .clampMaxNumElements(0, S16, 2)
+      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
+      // narrowScalar limitation.
+ .widenScalarToNextPow2(0)
+ .clampScalar(0, S16, S32)
+ .scalarize(0);
+
if (ST.hasVOP3PInsts()) {
getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
.legalFor({S32, S16, V2S16})
.moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
.clampMaxNumElements(0, S16, 2)
- .clampScalar(0, S16, S32)
+ .minScalar(0, S16)
.widenScalarToNextPow2(0)
- .scalarize(0);
+ .scalarize(0)
+ .lower();
} else {
getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
.legalFor({S32, S16})
.widenScalarToNextPow2(0)
- .clampScalar(0, S16, S32)
- .scalarize(0);
+ .minScalar(0, S16)
+ .scalarize(0)
+ .lower();
}
} else {
+ // TODO: Should have same legality without v_perm_b32
+ getActionDefinitionsBuilder(G_BSWAP)
+ .legalFor({S32})
+ .lowerIf(scalarNarrowerThan(0, 32))
+      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
+      // narrowScalar limitation.
+ .widenScalarToNextPow2(0)
+ .maxScalar(0, S32)
+ .scalarize(0)
+ .lower();
+
getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
.legalFor({S32})
- .clampScalar(0, S32, S32)
+ .minScalar(0, S32)
.widenScalarToNextPow2(0)
- .scalarize(0);
+ .scalarize(0)
+ .lower();
}
- auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
- return [=](const LegalityQuery &Query) {
- return Query.Types[TypeIdx0].getSizeInBits() <
- Query.Types[TypeIdx1].getSizeInBits();
- };
- };
-
- auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
- return [=](const LegalityQuery &Query) {
- return Query.Types[TypeIdx0].getSizeInBits() >
- Query.Types[TypeIdx1].getSizeInBits();
- };
- };
-
getActionDefinitionsBuilder(G_INTTOPTR)
// List the common cases
.legalForCartesianProduct(AddrSpaces64, {S64})
@@ -609,7 +838,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
[](const LegalityQuery &Query) {
return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
})
- .narrowScalarIf(greaterThan(1, 0),
+ .narrowScalarIf(largerThan(1, 0),
[](const LegalityQuery &Query) {
return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
});
@@ -626,7 +855,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
})
.narrowScalarIf(
- greaterThan(0, 1),
+ largerThan(0, 1),
[](const LegalityQuery &Query) {
return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
});
@@ -635,33 +864,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.scalarize(0)
.custom();
- // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
- // handle some operations by just promoting the register during
- // selection. There are also d16 loads on GFX9+ which preserve the high bits.
- auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
- switch (AS) {
- // FIXME: Private element size.
- case AMDGPUAS::PRIVATE_ADDRESS:
- return 32;
- // FIXME: Check subtarget
- case AMDGPUAS::LOCAL_ADDRESS:
- return ST.useDS128() ? 128 : 64;
-
- // Treat constant and global as identical. SMRD loads are sometimes usable
- // for global loads (ideally constant address space should be eliminated)
- // depending on the context. Legality cannot be context dependent, but
- // RegBankSelect can split the load as necessary depending on the pointer
- // register bank/uniformity and if the memory is invariant or not written in
- // a kernel.
- case AMDGPUAS::CONSTANT_ADDRESS:
- case AMDGPUAS::GLOBAL_ADDRESS:
- return 512;
- default:
- return 128;
- }
- };
-
- const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
+ const auto needToSplitMemOp = [=](const LegalityQuery &Query,
+ bool IsLoad) -> bool {
const LLT DstTy = Query.Types[0];
// Split vector extloads.
@@ -676,14 +880,20 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
const LLT PtrTy = Query.Types[1];
unsigned AS = PtrTy.getAddressSpace();
- if (MemSize > maxSizeForAddrSpace(AS))
+ if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
return true;
// Catch weird sized loads that don't evenly divide into the access sizes
// TODO: May be able to widen depending on alignment etc.
- unsigned NumRegs = MemSize / 32;
- if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
- return true;
+ unsigned NumRegs = (MemSize + 31) / 32;
+ if (NumRegs == 3) {
+ if (!ST.hasDwordx3LoadStores())
+ return true;
+ } else {
+ // If the alignment allows, these should have been widened.
+ if (!isPowerOf2_32(NumRegs))
+ return true;
+ }
if (Align < MemSize) {
const SITargetLowering *TLI = ST.getTargetLowering();
@@ -693,6 +903,24 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
return false;
};
+ const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
+ unsigned Opc) -> bool {
+ unsigned Size = Query.Types[0].getSizeInBits();
+ if (isPowerOf2_32(Size))
+ return false;
+
+ if (Size == 96 && ST.hasDwordx3LoadStores())
+ return false;
+
+ unsigned AddrSpace = Query.Types[1].getAddressSpace();
+ if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc))
+ return false;
+
+ unsigned Align = Query.MMODescrs[0].AlignInBits;
+ unsigned RoundedSize = NextPowerOf2(Size);
+ return (Align >= RoundedSize);
+ };
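// Editor's illustrative sketch (not part of the patch above): the widening
// condition above on plain numbers — a load result whose size is not a
// power of two may be widened when the access is aligned to the rounded-up
// size, so the extra bytes are known to be loadable. The 96-bit dwordx3
// special case is omitted, and the address-space size cap is folded into a
// parameter.
static bool shouldWidenToPow2(unsigned SizeBits, unsigned AlignBits,
                              unsigned MaxBitsForAddrSpace) {
  const bool IsPow2 = SizeBits != 0 && (SizeBits & (SizeBits - 1)) == 0;
  if (IsPow2)
    return false; // already a power of two
  if (SizeBits >= MaxBitsForAddrSpace)
    return false; // a single access cannot cover the widened size
  unsigned Rounded = 1;
  while (Rounded < SizeBits)
    Rounded <<= 1;
  return AlignBits >= Rounded; // e.g. a 96-bit load with 128-bit alignment
}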
+
unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
@@ -705,17 +933,11 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
const bool IsStore = Op == G_STORE;
auto &Actions = getActionDefinitionsBuilder(Op);
- // Whitelist the common cases.
- // TODO: Pointer loads
- // TODO: Wide constant loads
- // TODO: Only CI+ has 3x loads
- // TODO: Loads to s16 on gfx9
+ // Explicitly list some common cases.
+ // TODO: Does this help compile time at all?
Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
{V2S32, GlobalPtr, 64, GlobalAlign32},
- {V3S32, GlobalPtr, 96, GlobalAlign32},
- {S96, GlobalPtr, 96, GlobalAlign32},
{V4S32, GlobalPtr, 128, GlobalAlign32},
- {S128, GlobalPtr, 128, GlobalAlign32},
{S64, GlobalPtr, 64, GlobalAlign32},
{V2S64, GlobalPtr, 128, GlobalAlign32},
{V2S16, GlobalPtr, 32, GlobalAlign32},
@@ -734,23 +956,60 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
{S32, PrivatePtr, 16, 16},
{V2S16, PrivatePtr, 32, 32},
- {S32, FlatPtr, 32, GlobalAlign32},
- {S32, FlatPtr, 16, GlobalAlign16},
- {S32, FlatPtr, 8, GlobalAlign8},
- {V2S16, FlatPtr, 32, GlobalAlign32},
-
{S32, ConstantPtr, 32, GlobalAlign32},
{V2S32, ConstantPtr, 64, GlobalAlign32},
- {V3S32, ConstantPtr, 96, GlobalAlign32},
{V4S32, ConstantPtr, 128, GlobalAlign32},
{S64, ConstantPtr, 64, GlobalAlign32},
- {S128, ConstantPtr, 128, GlobalAlign32},
{V2S32, ConstantPtr, 32, GlobalAlign32}});
+ Actions.legalIf(
+ [=](const LegalityQuery &Query) -> bool {
+ return isLoadStoreLegal(ST, Query, Op);
+ });
+
+ // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
+ // 64-bits.
+ //
+ // TODO: Should generalize bitcast action into coerce, which will also cover
+ // inserting addrspacecasts.
+ Actions.customIf(typeIs(1, Constant32Ptr));
+
+ // Turn any illegal element vectors into something easier to deal
+ // with. These will ultimately produce 32-bit scalar shifts to extract the
+ // parts anyway.
+ //
+ // For odd 16-bit element vectors, prefer to split those into pieces with
+ // 16-bit vector parts.
+ Actions.bitcastIf(
+ [=](const LegalityQuery &Query) -> bool {
+ const LLT Ty = Query.Types[0];
+ const unsigned Size = Ty.getSizeInBits();
+
+ if (Size != Query.MMODescrs[0].SizeInBits)
+ return Size <= 32 && Ty.isVector();
+
+ if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
+ return true;
+ return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
+ !isRegisterVectorElementType(Ty.getElementType());
+ }, bitcastToRegisterType(0));
+
Actions
.customIf(typeIs(1, Constant32Ptr))
+ // Widen suitably aligned loads by loading extra elements.
+ .moreElementsIf([=](const LegalityQuery &Query) {
+ const LLT Ty = Query.Types[0];
+ return Op == G_LOAD && Ty.isVector() &&
+ shouldWidenLoadResult(Query, Op);
+ }, moreElementsToNextPow2(0))
+ .widenScalarIf([=](const LegalityQuery &Query) {
+ const LLT Ty = Query.Types[0];
+ return Op == G_LOAD && !Ty.isVector() &&
+ shouldWidenLoadResult(Query, Op);
+ }, widenScalarOrEltToNextPow2(0))
.narrowScalarIf(
[=](const LegalityQuery &Query) -> bool {
- return !Query.Types[0].isVector() && needToSplitLoad(Query);
+ return !Query.Types[0].isVector() &&
+ needToSplitMemOp(Query, Op == G_LOAD);
},
[=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
const LLT DstTy = Query.Types[0];
@@ -763,13 +1022,23 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
if (DstSize > MemSize)
return std::make_pair(0, LLT::scalar(MemSize));
+ if (!isPowerOf2_32(DstSize)) {
+ // We're probably decomposing an odd sized store. Try to split
+ // to the widest type. TODO: Account for alignment. As-is it
+ // should be OK, since the new parts will be further legalized.
+ unsigned FloorSize = PowerOf2Floor(DstSize);
+ return std::make_pair(0, LLT::scalar(FloorSize));
+ }
+
if (DstSize > 32 && (DstSize % 32 != 0)) {
// FIXME: Need a way to specify non-extload of larger size if
// suitably aligned.
return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
}
- unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
+ unsigned MaxSize = maxSizeForAddrSpace(ST,
+ PtrTy.getAddressSpace(),
+ Op == G_LOAD);
if (MemSize > MaxSize)
return std::make_pair(0, LLT::scalar(MaxSize));
@@ -778,18 +1047,32 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
})
.fewerElementsIf(
[=](const LegalityQuery &Query) -> bool {
- return Query.Types[0].isVector() && needToSplitLoad(Query);
+ return Query.Types[0].isVector() &&
+ needToSplitMemOp(Query, Op == G_LOAD);
},
[=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
const LLT DstTy = Query.Types[0];
const LLT PtrTy = Query.Types[1];
LLT EltTy = DstTy.getElementType();
- unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
+ unsigned MaxSize = maxSizeForAddrSpace(ST,
+ PtrTy.getAddressSpace(),
+ Op == G_LOAD);
+
+ // FIXME: Handle widened to power of 2 results better. This ends
+ // up scalarizing.
+ // FIXME: 3 element stores scalarized on SI
// Split if it's too large for the address space.
if (Query.MMODescrs[0].SizeInBits > MaxSize) {
unsigned NumElts = DstTy.getNumElements();
+ unsigned EltSize = EltTy.getSizeInBits();
+
+ if (MaxSize % EltSize == 0) {
+ return std::make_pair(
+ 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
+ }
+
unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
// FIXME: Refine when odd breakdowns handled
@@ -802,9 +1085,24 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
LLT::vector(NumElts / NumPieces, EltTy));
}
+ // FIXME: We could probably handle weird extending loads better.
+ unsigned MemSize = Query.MMODescrs[0].SizeInBits;
+ if (DstTy.getSizeInBits() > MemSize)
+ return std::make_pair(0, EltTy);
+
+ unsigned EltSize = EltTy.getSizeInBits();
+ unsigned DstSize = DstTy.getSizeInBits();
+ if (!isPowerOf2_32(DstSize)) {
+ // We're probably decomposing an odd sized store. Try to split
+ // to the widest type. TODO: Account for alignment. As-is it
+ // should be OK, since the new parts will be further legalized.
+ unsigned FloorSize = PowerOf2Floor(DstSize);
+ return std::make_pair(
+ 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
+ }
+
// Need to split because of alignment.
unsigned Align = Query.MMODescrs[0].AlignInBits;
- unsigned EltSize = EltTy.getSizeInBits();
if (EltSize > Align &&
(EltSize / Align < DstTy.getNumElements())) {
return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
@@ -820,39 +1118,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// TODO: Need a bitcast lower option?
Actions
- .legalIf([=](const LegalityQuery &Query) {
- const LLT Ty0 = Query.Types[0];
- unsigned Size = Ty0.getSizeInBits();
- unsigned MemSize = Query.MMODescrs[0].SizeInBits;
- unsigned Align = Query.MMODescrs[0].AlignInBits;
-
- // FIXME: Widening store from alignment not valid.
- if (MemSize < Size)
- MemSize = std::max(MemSize, Align);
-
- // No extending vector loads.
- if (Size > MemSize && Ty0.isVector())
- return false;
-
- switch (MemSize) {
- case 8:
- case 16:
- return Size == 32;
- case 32:
- case 64:
- case 128:
- return true;
- case 96:
- return ST.hasDwordx3LoadStores();
- case 256:
- case 512:
- return true;
- default:
- return false;
- }
- })
.widenScalarToNextPow2(0)
- // TODO: v3s32->v4s32 with alignment
.moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
}
@@ -886,8 +1152,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
}
- getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
- .legalFor({{S32, LocalPtr}});
+ if (ST.hasLDSFPAtomics()) {
+ getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
+ .legalFor({{S32, LocalPtr}});
+ }
// BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
// demarshalling
@@ -896,10 +1164,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
{S32, FlatPtr}, {S64, FlatPtr}})
.legalFor({{S32, LocalPtr}, {S64, LocalPtr},
{S32, RegionPtr}, {S64, RegionPtr}});
-
- getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
- .lower();
-
// TODO: Pointer types, any 32-bit or 64-bit vector
// Condition should be s32 for scalar, s1 for vector.
@@ -908,9 +1172,9 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
.clampScalar(0, S16, S64)
+ .scalarize(1)
.moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
.fewerElementsIf(numElementsNotEven(0), scalarize(0))
- .scalarize(1)
.clampMaxNumElements(0, S32, 2)
.clampMaxNumElements(0, LocalPtr, 2)
.clampMaxNumElements(0, PrivatePtr, 2)
@@ -924,12 +1188,22 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.legalFor({{S32, S32}, {S64, S32}});
if (ST.has16BitInsts()) {
if (ST.hasVOP3PInsts()) {
- Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
+ Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
.clampMaxNumElements(0, S16, 2);
} else
- Shifts.legalFor({{S16, S32}, {S16, S16}});
+ Shifts.legalFor({{S16, S16}});
- // TODO: Support 16-bit shift amounts
+ // TODO: Support 16-bit shift amounts for all types
+ Shifts.widenScalarIf(
+ [=](const LegalityQuery &Query) {
+ // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
+ // 32-bit amount.
+ const LLT ValTy = Query.Types[0];
+ const LLT AmountTy = Query.Types[1];
+ return ValTy.getSizeInBits() <= 16 &&
+ AmountTy.getSizeInBits() < 16;
+ }, changeTo(1, S16));
+ Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
Shifts.clampScalar(1, S32, S32);
Shifts.clampScalar(0, S16, S64);
Shifts.widenScalarToNextPow2(0, 16);
@@ -956,7 +1230,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
return (EltTy.getSizeInBits() == 16 ||
EltTy.getSizeInBits() % 32 == 0) &&
VecTy.getSizeInBits() % 32 == 0 &&
- VecTy.getSizeInBits() <= 1024 &&
+ VecTy.getSizeInBits() <= MaxRegisterSize &&
IdxTy.getSizeInBits() == 32;
})
.clampScalar(EltTypeIdx, S32, S64)
@@ -1008,28 +1282,40 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.clampNumElements(0, V2S64, V16S64)
.fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
- if (ST.hasScalarPackInsts())
- BuildVector.legalFor({V2S16, S32});
-
- BuildVector
- .minScalarSameAs(1, 0)
- .legalIf(isRegisterType(0))
- .minScalarOrElt(0, S32);
-
if (ST.hasScalarPackInsts()) {
+ BuildVector
+ // FIXME: Should probably widen s1 vectors straight to s32
+ .minScalarOrElt(0, S16)
+ // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
+ .minScalar(1, S32);
+
getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
.legalFor({V2S16, S32})
.lower();
+ BuildVector.minScalarOrElt(0, S32);
} else {
+ BuildVector.customFor({V2S16, S16});
+ BuildVector.minScalarOrElt(0, S32);
+
getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
+ .customFor({V2S16, S32})
.lower();
}
+ BuildVector.legalIf(isRegisterType(0));
+
+ // FIXME: Clamp maximum size
getActionDefinitionsBuilder(G_CONCAT_VECTORS)
.legalIf(isRegisterType(0));
- // TODO: Don't fully scalarize v2s16 pieces
- getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
+  // TODO: Don't fully scalarize v2s16 pieces? Or combine those out
+  // pre-legalize.
+ if (ST.hasVOP3PInsts()) {
+ getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
+ .customFor({V2S16, V2S16})
+ .lower();
+ } else
+ getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
// Merge/Unmerge
for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
@@ -1037,10 +1323,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
- const LLT &Ty = Query.Types[TypeIdx];
+ const LLT Ty = Query.Types[TypeIdx];
if (Ty.isVector()) {
const LLT &EltTy = Ty.getElementType();
- if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
+ if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
return true;
if (!isPowerOf2_32(EltTy.getSizeInBits()))
return true;
@@ -1049,25 +1335,32 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
};
auto &Builder = getActionDefinitionsBuilder(Op)
+ .lowerFor({{S16, V2S16}})
+ .lowerIf([=](const LegalityQuery &Query) {
+ const LLT BigTy = Query.Types[BigTyIdx];
+ return BigTy.getSizeInBits() == 32;
+ })
+ // Try to widen to s16 first for small types.
+ // TODO: Only do this on targets with legal s16 shifts
+ .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
.widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
- // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
- // worth considering the multiples of 64 since 2*192 and 2*384 are not
- // valid.
- .clampScalar(LitTyIdx, S16, S256)
- .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
.moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
.fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
elementTypeIs(1, S16)),
changeTo(1, V2S16))
+      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
+ // worth considering the multiples of 64 since 2*192 and 2*384 are not
+ // valid.
+ .clampScalar(LitTyIdx, S32, S512)
+ .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
// Break up vectors with weird elements into scalars
.fewerElementsIf(
- [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
+ [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
scalarize(0))
.fewerElementsIf(
- [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
+ [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
scalarize(1))
- .clampScalar(BigTyIdx, S32, S1024)
- .lowerFor({{S16, V2S16}});
+ .clampScalar(BigTyIdx, S32, MaxScalar);
if (Op == G_MERGE_VALUES) {
Builder.widenScalarIf(
@@ -1108,22 +1401,68 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
return BigTy.getSizeInBits() % 16 == 0 &&
LitTy.getSizeInBits() % 16 == 0 &&
- BigTy.getSizeInBits() <= 1024;
+ BigTy.getSizeInBits() <= MaxRegisterSize;
})
// Any vectors left are the wrong size. Scalarize them.
.scalarize(0)
.scalarize(1);
}
- getActionDefinitionsBuilder(G_SEXT_INREG).lower();
+ // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
+ // RegBankSelect.
+ auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
+ .legalFor({{S32}, {S64}});
+
+ if (ST.hasVOP3PInsts()) {
+ SextInReg.lowerFor({{V2S16}})
+ // Prefer to reduce vector widths for 16-bit vectors before lowering, to
+ // get more vector shift opportunities, since we'll get those when
+ // expanded.
+ .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
+ } else if (ST.has16BitInsts()) {
+ SextInReg.lowerFor({{S32}, {S64}, {S16}});
+ } else {
+ // Prefer to promote to s32 before lowering if we don't have 16-bit
+    // shifts. This avoids a lot of intermediate truncate and extend operations.
+ SextInReg.lowerFor({{S32}, {S64}});
+ }
+
+ // FIXME: Placeholder rule. Really depends on whether the clamp modifier is
+ // available, and is selectively legal for s16, s32, v2s16.
+ getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT, G_UADDSAT, G_USUBSAT})
+ .scalarize(0)
+ .clampScalar(0, S16, S32);
- getActionDefinitionsBuilder({G_READ_REGISTER, G_WRITE_REGISTER}).lower();
+ SextInReg
+ .scalarize(0)
+ .clampScalar(0, S32, S64)
+ .lower();
+
+ getActionDefinitionsBuilder(G_FSHR)
+ .legalFor({{S32, S32}})
+ .scalarize(0)
+ .lower();
getActionDefinitionsBuilder(G_READCYCLECOUNTER)
.legalFor({S64});
+ getActionDefinitionsBuilder({
+ // TODO: Verify V_BFI_B32 is generated from expanded bit ops
+ G_FCOPYSIGN,
+
+ G_ATOMIC_CMPXCHG_WITH_SUCCESS,
+ G_READ_REGISTER,
+ G_WRITE_REGISTER,
+
+ G_SADDO, G_SSUBO,
+
+ // TODO: Implement
+ G_FMINIMUM, G_FMAXIMUM,
+ G_FSHL
+ }).lower();
+
getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
- G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
+ G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
.unsupported();
@@ -1131,10 +1470,12 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
verify(*ST.getInstrInfo());
}
-bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
- MachineRegisterInfo &MRI,
- MachineIRBuilder &B,
- GISelChangeObserver &Observer) const {
+bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
+ MachineInstr &MI) const {
+ MachineIRBuilder &B = Helper.MIRBuilder;
+ MachineRegisterInfo &MRI = *B.getMRI();
+ GISelChangeObserver &Observer = Helper.Observer;
+
switch (MI.getOpcode()) {
case TargetOpcode::G_ADDRSPACE_CAST:
return legalizeAddrSpaceCast(MI, MRI, B);
@@ -1148,15 +1489,21 @@ bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
return legalizeITOFP(MI, MRI, B, true);
case TargetOpcode::G_UITOFP:
return legalizeITOFP(MI, MRI, B, false);
+ case TargetOpcode::G_FPTOSI:
+ return legalizeFPTOI(MI, MRI, B, true);
+ case TargetOpcode::G_FPTOUI:
+ return legalizeFPTOI(MI, MRI, B, false);
case TargetOpcode::G_FMINNUM:
case TargetOpcode::G_FMAXNUM:
case TargetOpcode::G_FMINNUM_IEEE:
case TargetOpcode::G_FMAXNUM_IEEE:
- return legalizeMinNumMaxNum(MI, MRI, B);
+ return legalizeMinNumMaxNum(Helper, MI);
case TargetOpcode::G_EXTRACT_VECTOR_ELT:
return legalizeExtractVectorElt(MI, MRI, B);
case TargetOpcode::G_INSERT_VECTOR_ELT:
return legalizeInsertVectorElt(MI, MRI, B);
+ case TargetOpcode::G_SHUFFLE_VECTOR:
+ return legalizeShuffleVector(MI, MRI, B);
case TargetOpcode::G_FSIN:
case TargetOpcode::G_FCOS:
return legalizeSinCos(MI, MRI, B);
@@ -1168,8 +1515,26 @@ bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
return legalizeFMad(MI, MRI, B);
case TargetOpcode::G_FDIV:
return legalizeFDIV(MI, MRI, B);
+ case TargetOpcode::G_UDIV:
+ case TargetOpcode::G_UREM:
+ return legalizeUDIV_UREM(MI, MRI, B);
+ case TargetOpcode::G_SDIV:
+ case TargetOpcode::G_SREM:
+ return legalizeSDIV_SREM(MI, MRI, B);
case TargetOpcode::G_ATOMIC_CMPXCHG:
return legalizeAtomicCmpXChg(MI, MRI, B);
+ case TargetOpcode::G_FLOG:
+ return legalizeFlog(MI, B, numbers::ln2f);
+ case TargetOpcode::G_FLOG10:
+ return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
+ case TargetOpcode::G_FEXP:
+ return legalizeFExp(MI, B);
+ case TargetOpcode::G_FPOW:
+ return legalizeFPow(MI, B);
+ case TargetOpcode::G_FFLOOR:
+ return legalizeFFloor(MI, MRI, B);
+ case TargetOpcode::G_BUILD_VECTOR:
+ return legalizeBuildVector(MI, MRI, B);
default:
return false;
}
@@ -1201,7 +1566,6 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
- Register ApertureReg = MRI.createGenericVirtualRegister(S32);
Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
B.buildInstr(AMDGPU::S_GETREG_B32)
@@ -1210,12 +1574,7 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
MRI.setType(GetReg, S32);
auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
- B.buildInstr(TargetOpcode::G_SHL)
- .addDef(ApertureReg)
- .addUse(GetReg)
- .addUse(ShiftAmt.getReg(0));
-
- return ApertureReg;
+ return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
}
Register QueuePtr = MRI.createGenericVirtualRegister(
@@ -1232,19 +1591,15 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
// TODO: can we be smarter about machine pointer info?
MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
MachineMemOperand *MMO = MF.getMachineMemOperand(
- PtrInfo,
- MachineMemOperand::MOLoad |
- MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOInvariant,
- 4,
- MinAlign(64, StructOffset));
-
- Register LoadResult = MRI.createGenericVirtualRegister(S32);
+ PtrInfo,
+ MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant,
+ 4, commonAlignment(Align(64), StructOffset));
+
Register LoadAddr;
B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
- B.buildLoad(LoadResult, LoadAddr, *MMO);
- return LoadResult;
+ return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}
bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
@@ -1252,8 +1607,6 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
MachineIRBuilder &B) const {
MachineFunction &MF = B.getMF();
- B.setInstr(MI);
-
const LLT S32 = LLT::scalar(32);
Register Dst = MI.getOperand(0).getReg();
Register Src = MI.getOperand(1).getReg();
@@ -1292,7 +1645,7 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
// extra ptrtoint would be kind of pointless.
auto HighAddr = B.buildConstant(
LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
- B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
+ B.buildMerge(Dst, {Src, HighAddr});
MI.eraseFromParent();
return true;
}
@@ -1305,13 +1658,11 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
auto SegmentNull = B.buildConstant(DstTy, NullVal);
auto FlatNull = B.buildConstant(SrcTy, 0);
- Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
-
// Extract low 32-bits of the pointer.
- B.buildExtract(PtrLo32, Src, 0);
+ auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
- Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
- B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
+ auto CmpRes =
+ B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
MI.eraseFromParent();
@@ -1333,21 +1684,16 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
if (!ApertureReg.isValid())
return false;
- Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
- B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
-
- Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
+ auto CmpRes =
+ B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
// Coerce the type of the low half of the result so we can use merge_values.
- Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
- B.buildInstr(TargetOpcode::G_PTRTOINT)
- .addDef(SrcAsInt)
- .addUse(Src);
+ Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
// TODO: Should we allow mismatched types but matching sizes in merges to
// avoid the ptrtoint?
- B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
- B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
+ auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
+ B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
MI.eraseFromParent();
return true;
@@ -1356,8 +1702,6 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
bool AMDGPULegalizerInfo::legalizeFrint(
MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
- B.setInstr(MI);
-
Register Src = MI.getOperand(1).getReg();
LLT Ty = MRI.getType(Src);
assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
@@ -1383,7 +1727,6 @@ bool AMDGPULegalizerInfo::legalizeFrint(
bool AMDGPULegalizerInfo::legalizeFceil(
MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
- B.setInstr(MI);
const LLT S1 = LLT::scalar(1);
const LLT S64 = LLT::scalar(64);
@@ -1395,7 +1738,7 @@ bool AMDGPULegalizerInfo::legalizeFceil(
// if (src > 0.0 && src != result)
// result += 1.0
- auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});
+ auto Trunc = B.buildIntrinsicTrunc(S64, Src);
const auto Zero = B.buildFConstant(S64, 0.0);
const auto One = B.buildFConstant(S64, 1.0);
@@ -1428,8 +1771,6 @@ static MachineInstrBuilder extractF64Exponent(unsigned Hi,
bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
- B.setInstr(MI);
-
const LLT S1 = LLT::scalar(1);
const LLT S32 = LLT::scalar(32);
const LLT S64 = LLT::scalar(64);
@@ -1456,7 +1797,7 @@ bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
const auto Zero32 = B.buildConstant(S32, 0);
// Extend back to 64-bits.
- auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
+ auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
auto Shr = B.buildAShr(S64, FractMask, Exp);
auto Not = B.buildNot(S64, Shr);
@@ -1474,7 +1815,6 @@ bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
bool AMDGPULegalizerInfo::legalizeITOFP(
MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B, bool Signed) const {
- B.setInstr(MI);
Register Dst = MI.getOperand(0).getReg();
Register Src = MI.getOperand(1).getReg();
@@ -1503,10 +1843,44 @@ bool AMDGPULegalizerInfo::legalizeITOFP(
return true;
}
-bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
+// TODO: Copied from DAG implementation. Verify logic and document how this
+// actually works.
+bool AMDGPULegalizerInfo::legalizeFPTOI(
MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &B) const {
- MachineFunction &MF = B.getMF();
+ MachineIRBuilder &B, bool Signed) const {
+
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+
+ const LLT S64 = LLT::scalar(64);
+ const LLT S32 = LLT::scalar(32);
+
+ assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
+
+ unsigned Flags = MI.getFlags();
+
+ auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
+ auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
+ auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
+
+ auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
+ auto FloorMul = B.buildFFloor(S64, Mul, Flags);
+ auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
+
+ auto Hi = Signed ?
+ B.buildFPTOSI(S32, FloorMul) :
+ B.buildFPTOUI(S32, FloorMul);
+ auto Lo = B.buildFPTOUI(S32, Fma);
+
+ B.buildMerge(Dst, { Lo, Hi });
+ MI.eraseFromParent();
+
+ return true;
+}
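// Illustrative sketch (assumption, not part of this patch): the same f64 to
// 64-bit integer split written in plain C++, to show what the two constants
// above encode (0x3df0000000000000 is 2^-32, 0xc1f0000000000000 is -2^32).
// The unsigned case is shown; the signed case converts Hi with fptosi instead.
#include <cmath>
#include <cstdint>

static uint64_t fptoui64Model(double X) {
  const double TwoPow32 = 4294967296.0;        // 2^32
  double Trunc = std::trunc(X);                // G_INTRINSIC_TRUNC
  double Hi = std::floor(Trunc / TwoPow32);    // scale by K0 (2^-32), then G_FFLOOR
  double Lo = std::fma(Hi, -TwoPow32, Trunc);  // K1 FMA: Trunc - Hi * 2^32
  return ((uint64_t)(uint32_t)Hi << 32) | (uint32_t)Lo;  // G_MERGE_VALUES
}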
+
+bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
+ MachineInstr &MI) const {
+ MachineFunction &MF = Helper.MIRBuilder.getMF();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
@@ -1520,10 +1894,6 @@ bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
if (IsIEEEOp)
return true;
- MachineIRBuilder HelperBuilder(MI);
- GISelObserverWrapper DummyObserver;
- LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
- HelperBuilder.setInstr(MI);
return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}
@@ -1533,8 +1903,12 @@ bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
// TODO: Should move some of this into LegalizerHelper.
// TODO: Promote dynamic indexing of s16 to s32
- // TODO: Dynamic s64 indexing is only legal for SGPR.
- Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
+
+ // FIXME: Artifact combiner probably should have replaced the truncated
+ // constant before this, so we shouldn't need
+ // getConstantVRegValWithLookThrough.
+ Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
+ MI.getOperand(2).getReg(), MRI);
if (!IdxVal) // Dynamic case will be selected to register indexing.
return true;
@@ -1545,10 +1919,8 @@ bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
LLT EltTy = VecTy.getElementType();
assert(EltTy == MRI.getType(Dst));
- B.setInstr(MI);
-
- if (IdxVal.getValue() < VecTy.getNumElements())
- B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
+ if (IdxVal->Value < VecTy.getNumElements())
+ B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
else
B.buildUndef(Dst);
@@ -1562,8 +1934,12 @@ bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
// TODO: Should move some of this into LegalizerHelper.
// TODO: Promote dynamic indexing of s16 to s32
- // TODO: Dynamic s64 indexing is only legal for SGPR.
- Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
+
+ // FIXME: Artifact combiner probably should have replaced the truncated
+ // constant before this, so we shouldn't need
+ // getConstantVRegValWithLookThrough.
+ Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
+ MI.getOperand(3).getReg(), MRI);
if (!IdxVal) // Dynamic case will be selected to register indexing.
return true;
@@ -1575,10 +1951,8 @@ bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
LLT EltTy = VecTy.getElementType();
assert(EltTy == MRI.getType(Ins));
- B.setInstr(MI);
-
- if (IdxVal.getValue() < VecTy.getNumElements())
- B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
+ if (IdxVal->Value < VecTy.getNumElements())
+ B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
else
B.buildUndef(Dst);
@@ -1586,10 +1960,29 @@ bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
return true;
}
+bool AMDGPULegalizerInfo::legalizeShuffleVector(
+ MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ const LLT V2S16 = LLT::vector(2, 16);
+
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src0 = MI.getOperand(1).getReg();
+ LLT DstTy = MRI.getType(Dst);
+ LLT SrcTy = MRI.getType(Src0);
+
+ if (SrcTy == V2S16 && DstTy == V2S16 &&
+ AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
+ return true;
+
+ MachineIRBuilder HelperBuilder(MI);
+ GISelObserverWrapper DummyObserver;
+ LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
+ return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
+}
+
bool AMDGPULegalizerInfo::legalizeSinCos(
MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
- B.setInstr(MI);
Register DstReg = MI.getOperand(0).getReg();
Register SrcReg = MI.getOperand(1).getReg();
@@ -1597,7 +1990,7 @@ bool AMDGPULegalizerInfo::legalizeSinCos(
unsigned Flags = MI.getFlags();
Register TrigVal;
- auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
+ auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
if (ST.hasTrigReducedRange()) {
auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
@@ -1615,10 +2008,12 @@ bool AMDGPULegalizerInfo::legalizeSinCos(
return true;
}
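// Illustrative sketch (assumption, not part of this patch): the range
// reduction above in plain C++. The input is scaled by 1/(2*pi), and on
// subtargets with reduced-range trig the fractional part is taken first;
// the hardware sin/cos then operate on this normalized value.
#include <cmath>

static float sinModel(float X) {
  float Scaled = X * 0.15915494f;              // x * (0.5 * inv_pi)
  float TrigVal = Scaled - std::floor(Scaled); // amdgcn.fract on reduced-range HW
  return std::sin(TrigVal * 6.2831855f);       // model of the hardware sin
}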
-bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
- Register DstReg, LLT PtrTy,
- MachineIRBuilder &B, const GlobalValue *GV,
- unsigned Offset, unsigned GAFlags) const {
+bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
+ MachineIRBuilder &B,
+ const GlobalValue *GV,
+ int64_t Offset,
+ unsigned GAFlags) const {
+ assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
// In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
// to the following code sequence:
//
@@ -1681,19 +2076,37 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue(
const GlobalValue *GV = MI.getOperand(1).getGlobal();
MachineFunction &MF = B.getMF();
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- B.setInstr(MI);
if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
if (!MFI->isEntryFunction()) {
const Function &Fn = MF.getFunction();
DiagnosticInfoUnsupported BadLDSDecl(
- Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
+ Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
+ DS_Warning);
Fn.getContext().diagnose(BadLDSDecl);
+
+ // We currently don't have a way to correctly allocate LDS objects that
+ // aren't directly associated with a kernel. We do force inlining of
+ // functions that use local objects. However, if these dead functions are
+ // not eliminated, we don't want a compile time error. Just emit a warning
+ // and a trap, since there should be no callable path here.
+ B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
+ B.buildUndef(DstReg);
+ MI.eraseFromParent();
+ return true;
}
// TODO: We could emit code to handle the initialization somewhere.
if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
- B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
+ const SITargetLowering *TLI = ST.getTargetLowering();
+ if (!TLI->shouldUseLDSConstAddress(GV)) {
+ MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
+ return true; // Leave in place.
+ }
+
+ B.buildConstant(
+ DstReg,
+ MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
MI.eraseFromParent();
return true;
}
@@ -1723,10 +2136,10 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue(
Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
- MachinePointerInfo::getGOT(MF),
- MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOInvariant,
- 8 /*Size*/, 8 /*Align*/);
+ MachinePointerInfo::getGOT(MF),
+ MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant,
+ 8 /*Size*/, Align(8));
buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
@@ -1744,7 +2157,6 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue(
bool AMDGPULegalizerInfo::legalizeLoad(
MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B, GISelChangeObserver &Observer) const {
- B.setInstr(MI);
LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
Observer.changingInstr(MI);
@@ -1763,16 +2175,15 @@ bool AMDGPULegalizerInfo::legalizeFMad(
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
// TODO: Always legal with future ftz flag.
- if (Ty == LLT::scalar(32) && !MFI->getMode().FP32Denormals)
+ // FIXME: Do we need just output?
+ if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
return true;
- if (Ty == LLT::scalar(16) && !MFI->getMode().FP64FP16Denormals)
+ if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
return true;
-
MachineIRBuilder HelperBuilder(MI);
GISelObserverWrapper DummyObserver;
LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
- HelperBuilder.setMBB(*MI.getParent());
return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
}
@@ -1790,7 +2201,6 @@ bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
LLT ValTy = MRI.getType(CmpVal);
LLT VecTy = LLT::vector(2, ValTy);
- B.setInstr(MI);
Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
@@ -1803,39 +2213,248 @@ bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
return true;
}
+bool AMDGPULegalizerInfo::legalizeFlog(
+ MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+ LLT Ty = B.getMRI()->getType(Dst);
+ unsigned Flags = MI.getFlags();
+
+ auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
+ auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
+
+ B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
+ MI.eraseFromParent();
+ return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
+ MachineIRBuilder &B) const {
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+ unsigned Flags = MI.getFlags();
+ LLT Ty = B.getMRI()->getType(Dst);
+
+ auto K = B.buildFConstant(Ty, numbers::log2e);
+ auto Mul = B.buildFMul(Ty, Src, K, Flags);
+ B.buildFExp2(Dst, Mul, Flags);
+ MI.eraseFromParent();
+ return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
+ MachineIRBuilder &B) const {
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src0 = MI.getOperand(1).getReg();
+ Register Src1 = MI.getOperand(2).getReg();
+ unsigned Flags = MI.getFlags();
+ LLT Ty = B.getMRI()->getType(Dst);
+ const LLT S16 = LLT::scalar(16);
+ const LLT S32 = LLT::scalar(32);
+
+ if (Ty == S32) {
+ auto Log = B.buildFLog2(S32, Src0, Flags);
+ auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
+ .addUse(Log.getReg(0))
+ .addUse(Src1)
+ .setMIFlags(Flags);
+ B.buildFExp2(Dst, Mul, Flags);
+ } else if (Ty == S16) {
+ // There's no f16 fmul_legacy, so we need to convert for it.
+ auto Log = B.buildFLog2(S16, Src0, Flags);
+ auto Ext0 = B.buildFPExt(S32, Log, Flags);
+ auto Ext1 = B.buildFPExt(S32, Src1, Flags);
+ auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
+ .addUse(Ext0.getReg(0))
+ .addUse(Ext1.getReg(0))
+ .setMIFlags(Flags);
+
+ B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
+ } else
+ return false;
+
+ MI.eraseFromParent();
+ return true;
+}
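// Illustrative sketch (assumption, not part of this patch): the scalar
// identities the three lowerings above rely on. flog/flog10 rescale log2 by
// ln(2)/ln(base), fexp uses exp2(x * log2(e)), and fpow uses exp2(y * log2(x))
// (with fmul_legacy standing in for the plain multiply on AMDGPU).
#include <cmath>

static float flogModel(float X)   { return std::log2(X) * 0.69314718f; } // ln(2)
static float flog10Model(float X) { return std::log2(X) * 0.30103f;     } // ln(2)/ln(10)
static float fexpModel(float X)   { return std::exp2(X * 1.44269504f);  } // log2(e)
static float fpowModel(float X, float Y) { return std::exp2(Y * std::log2(X)); }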
+
+// Find a source register, ignoring any possible source modifiers.
+static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
+ Register ModSrc = OrigSrc;
+ if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
+ ModSrc = SrcFNeg->getOperand(1).getReg();
+ if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
+ ModSrc = SrcFAbs->getOperand(1).getReg();
+ } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
+ ModSrc = SrcFAbs->getOperand(1).getReg();
+ return ModSrc;
+}
+
+bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+
+ const LLT S1 = LLT::scalar(1);
+ const LLT S64 = LLT::scalar(64);
+ Register Dst = MI.getOperand(0).getReg();
+ Register OrigSrc = MI.getOperand(1).getReg();
+ unsigned Flags = MI.getFlags();
+ assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
+ "this should not have been custom lowered");
+
+ // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
+ // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
+ // efficient way to implement it is using V_FRACT_F64. The workaround for the
+ // V_FRACT bug is:
+ // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
+ //
+ // Convert floor(x) to (x - fract(x))
+
+ auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
+ .addUse(OrigSrc)
+ .setMIFlags(Flags);
+
+ // Give source modifier matching some assistance before obscuring a foldable
+ // pattern.
+
+ // TODO: Can we avoid the neg on the fract? The input sign to fract
+ // shouldn't matter?
+ Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
+
+ auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
+
+ Register Min = MRI.createGenericVirtualRegister(S64);
+
+ // We don't need to concern ourselves with the snan handling difference, so
+ // use the one which will directly select.
+ const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
+ if (MFI->getMode().IEEE)
+ B.buildFMinNumIEEE(Min, Fract, Const, Flags);
+ else
+ B.buildFMinNum(Min, Fract, Const, Flags);
+
+ Register CorrectedFract = Min;
+ if (!MI.getFlag(MachineInstr::FmNoNans)) {
+ auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
+ CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
+ }
+
+ auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
+ B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
+
+ MI.eraseFromParent();
+ return true;
+}
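// Illustrative sketch (assumption, not part of this patch): a scalar model of
// the floor(x) = x - fract(x) expansion above, including the clamp to the
// largest double below 1.0 (0x3fefffffffffffff) and the NaN passthrough that
// works around the SI V_FRACT_F64 bug.
#include <algorithm>
#include <cmath>

static double floorViaFractModel(double X) {
  double Fract = X - std::floor(X);                       // what V_FRACT_F64 returns
  double Clamped = std::min(Fract, 0.99999999999999989);  // bits 0x3fefffffffffffff
  double Corrected = std::isnan(X) ? X : Clamped;
  return X - Corrected;
}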
+
+// Turn an illegal packed v2s16 build vector into bit operations.
+// TODO: This should probably be a bitcast action in LegalizerHelper.
+bool AMDGPULegalizerInfo::legalizeBuildVector(
+ MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
+ Register Dst = MI.getOperand(0).getReg();
+ const LLT S32 = LLT::scalar(32);
+ assert(MRI.getType(Dst) == LLT::vector(2, 16));
+
+ Register Src0 = MI.getOperand(1).getReg();
+ Register Src1 = MI.getOperand(2).getReg();
+ assert(MRI.getType(Src0) == LLT::scalar(16));
+
+ auto Merge = B.buildMerge(S32, {Src0, Src1});
+ B.buildBitcast(Dst, Merge);
+
+ MI.eraseFromParent();
+ return true;
+}
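// Illustrative sketch (assumption, not part of this patch): the merge+bitcast
// above amounts to packing the two 16-bit elements into one 32-bit word, with
// element 0 in the low half.
#include <cstdint>

static uint32_t packV2S16Model(uint16_t Elt0, uint16_t Elt1) {
  return (uint32_t)Elt0 | ((uint32_t)Elt1 << 16);
}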
+
// Return the use branch instruction, otherwise null if the usage is invalid.
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
MachineRegisterInfo &MRI,
- MachineInstr *&Br) {
+ MachineInstr *&Br,
+ MachineBasicBlock *&UncondBrTarget) {
Register CondDef = MI.getOperand(0).getReg();
if (!MRI.hasOneNonDBGUse(CondDef))
return nullptr;
+ MachineBasicBlock *Parent = MI.getParent();
MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
- if (UseMI.getParent() != MI.getParent() ||
+ if (UseMI.getParent() != Parent ||
UseMI.getOpcode() != AMDGPU::G_BRCOND)
return nullptr;
- // Make sure the cond br is followed by a G_BR
+ // Make sure the cond br is followed by a G_BR, or is the last instruction.
MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
- if (Next != MI.getParent()->end()) {
+ if (Next == Parent->end()) {
+ MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
+ if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
+ return nullptr;
+ UncondBrTarget = &*NextMBB;
+ } else {
if (Next->getOpcode() != AMDGPU::G_BR)
return nullptr;
Br = &*Next;
+ UncondBrTarget = Br->getOperand(0).getMBB();
}
return &UseMI;
}
-Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
- Register Reg, LLT Ty) const {
- Register LiveIn = MRI.getLiveInVirtReg(Reg);
- if (LiveIn)
+Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B,
+ MachineRegisterInfo &MRI,
+ Register LiveIn,
+ Register PhyReg) const {
+ assert(PhyReg.isPhysical() && "Physical register expected");
+
+ // Insert the live-in copy, if required, by defining the destination virtual
+ // register.
+ // FIXME: It seems EmitLiveInCopies isn't called anywhere?
+ if (!MRI.getVRegDef(LiveIn)) {
+ // FIXME: Should have scoped insert pt
+ MachineBasicBlock &OrigInsBB = B.getMBB();
+ auto OrigInsPt = B.getInsertPt();
+
+ MachineBasicBlock &EntryMBB = B.getMF().front();
+ EntryMBB.addLiveIn(PhyReg);
+ B.setInsertPt(EntryMBB, EntryMBB.begin());
+ B.buildCopy(LiveIn, PhyReg);
+
+ B.setInsertPt(OrigInsBB, OrigInsPt);
+ }
+
+ return LiveIn;
+}
+
+Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
+ MachineRegisterInfo &MRI,
+ Register PhyReg, LLT Ty,
+ bool InsertLiveInCopy) const {
+ assert(PhyReg.isPhysical() && "Physical register expected");
+
+ // Get or create the virtual live-in register.
+ Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
+ if (!LiveIn) {
+ LiveIn = MRI.createGenericVirtualRegister(Ty);
+ MRI.addLiveIn(PhyReg, LiveIn);
+ }
+
+ // When the copy that is actually required is from a virtual register to the
+ // physical register (to be inserted later), inserting a live-in copy from the
+ // physical register to the virtual register is not needed here.
+ if (!InsertLiveInCopy)
return LiveIn;
- Register NewReg = MRI.createGenericVirtualRegister(Ty);
- MRI.addLiveIn(Reg, NewReg);
- return NewReg;
+ return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
+}
+
+const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
+ MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
+ const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
+ const ArgDescriptor *Arg;
+ const TargetRegisterClass *RC;
+ LLT ArgTy;
+ std::tie(Arg, RC, ArgTy) = MFI->getPreloadedValue(ArgType);
+ if (!Arg) {
+ LLVM_DEBUG(dbgs() << "Required arg register missing\n");
+ return nullptr;
+ }
+ return Arg;
}
bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
@@ -1843,12 +2462,14 @@ bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
if (!Arg->isRegister() || !Arg->getRegister().isValid())
return false; // TODO: Handle these
- assert(Arg->getRegister().isPhysical());
+ Register SrcReg = Arg->getRegister();
+ assert(SrcReg.isPhysical() && "Physical register expected");
+ assert(DstReg.isVirtual() && "Virtual register expected");
MachineRegisterInfo &MRI = *B.getMRI();
LLT Ty = MRI.getType(DstReg);
- Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
+ Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
if (Arg->isMasked()) {
// TODO: Should we try to emit this once in the entry block?
@@ -1864,56 +2485,31 @@ bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
}
B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
- } else
+ } else {
B.buildCopy(DstReg, LiveIn);
-
- // Insert the argument copy if it doens't already exist.
- // FIXME: It seems EmitLiveInCopies isn't called anywhere?
- if (!MRI.getVRegDef(LiveIn)) {
- // FIXME: Should have scoped insert pt
- MachineBasicBlock &OrigInsBB = B.getMBB();
- auto OrigInsPt = B.getInsertPt();
-
- MachineBasicBlock &EntryMBB = B.getMF().front();
- EntryMBB.addLiveIn(Arg->getRegister());
- B.setInsertPt(EntryMBB, EntryMBB.begin());
- B.buildCopy(LiveIn, Arg->getRegister());
-
- B.setInsertPt(OrigInsBB, OrigInsPt);
}
return true;
}
bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
- MachineInstr &MI,
- MachineRegisterInfo &MRI,
- MachineIRBuilder &B,
- AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
- B.setInstr(MI);
-
- const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
+ MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
+ AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
- const ArgDescriptor *Arg;
- const TargetRegisterClass *RC;
- std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
- if (!Arg) {
- LLVM_DEBUG(dbgs() << "Required arg register missing\n");
+ const ArgDescriptor *Arg = getArgDescriptor(B, ArgType);
+ if (!Arg)
return false;
- }
- if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
- MI.eraseFromParent();
- return true;
- }
+ if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg))
+ return false;
- return false;
+ MI.eraseFromParent();
+ return true;
}
bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
- B.setInstr(MI);
Register Dst = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(Dst);
LLT S16 = LLT::scalar(16);
@@ -1933,6 +2529,284 @@ bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
return false;
}
+void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
+ Register DstReg,
+ Register X,
+ Register Y,
+ bool IsDiv) const {
+ const LLT S1 = LLT::scalar(1);
+ const LLT S32 = LLT::scalar(32);
+
+ // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
+ // algorithm used here.
+
+ // Initial estimate of inv(y).
+ auto FloatY = B.buildUITOFP(S32, Y);
+ auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
+ auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
+ auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
+ auto Z = B.buildFPTOUI(S32, ScaledY);
+
+ // One round of UNR.
+ auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
+ auto NegYZ = B.buildMul(S32, NegY, Z);
+ Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
+
+ // Quotient/remainder estimate.
+ auto Q = B.buildUMulH(S32, X, Z);
+ auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
+
+ // First quotient/remainder refinement.
+ auto One = B.buildConstant(S32, 1);
+ auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
+ if (IsDiv)
+ Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
+ R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
+
+ // Second quotient/remainder refinement.
+ Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
+ if (IsDiv)
+ B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
+ else
+ B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
+}
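// Illustrative sketch (assumption, not part of this patch): a scalar model of
// the 32-bit expansion above, with 1.0f/y standing in for V_RCP_IFLAG_F32.
// One unsigned Newton-Raphson step refines the ~2^32/y estimate, then two
// conditional corrections fix up the quotient and remainder.
#include <cstdint>

static uint32_t udiv32Model(uint32_t X, uint32_t Y) {
  // Initial estimate of 2^32 / Y (4294966784.0f has the bit pattern 0x4f7ffffe).
  uint32_t Z = (uint32_t)((1.0f / (float)Y) * 4294966784.0f);
  // One round of unsigned Newton-Raphson.
  uint32_t NegYZ = (0u - Y) * Z;
  Z += (uint32_t)(((uint64_t)Z * NegYZ) >> 32);
  // Quotient/remainder estimate plus two refinement steps.
  uint32_t Q = (uint32_t)(((uint64_t)X * Z) >> 32);
  uint32_t R = X - Q * Y;
  if (R >= Y) { ++Q; R -= Y; }
  if (R >= Y) { ++Q; R -= Y; }
  return Q; // return R instead for urem
}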
+
+bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
+ Register DstReg = MI.getOperand(0).getReg();
+ Register Num = MI.getOperand(1).getReg();
+ Register Den = MI.getOperand(2).getReg();
+ legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
+ MI.eraseFromParent();
+ return true;
+}
+
+// Build an integer reciprocal sequence around V_RCP_IFLAG_F32.
+//
+// Return lo, hi of result
+//
+// %cvt.lo = G_UITOFP Val.lo
+// %cvt.hi = G_UITOFP Val.hi
+// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
+// %rcp = G_AMDGPU_RCP_IFLAG %mad
+// %mul1 = G_FMUL %rcp, 0x5f7ffffc
+// %mul2 = G_FMUL %mul1, 2**(-32)
+// %trunc = G_INTRINSIC_TRUNC %mul2
+// %mad2 = G_FMAD %trunc, -(2**32), %mul1
+// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
+static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
+ Register Val) {
+ const LLT S32 = LLT::scalar(32);
+ auto Unmerge = B.buildUnmerge(S32, Val);
+
+ auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
+ auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
+
+ auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
+ B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
+
+ auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
+ auto Mul1 =
+ B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
+
+ // 2**(-32)
+ auto Mul2 =
+ B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
+ auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
+
+ // -(2**32)
+ auto Mad2 = B.buildFMAD(S32, Trunc,
+ B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
+
+ auto ResultLo = B.buildFPTOUI(S32, Mad2);
+ auto ResultHi = B.buildFPTOUI(S32, Trunc);
+
+ return {ResultLo.getReg(0), ResultHi.getReg(0)};
+}
+
+void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
+ Register DstReg,
+ Register Numer,
+ Register Denom,
+ bool IsDiv) const {
+ const LLT S32 = LLT::scalar(32);
+ const LLT S64 = LLT::scalar(64);
+ const LLT S1 = LLT::scalar(1);
+ Register RcpLo, RcpHi;
+
+ std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
+
+ auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
+
+ auto Zero64 = B.buildConstant(S64, 0);
+ auto NegDenom = B.buildSub(S64, Zero64, Denom);
+
+ auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
+ auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
+
+ auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
+ Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
+ Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
+
+ auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
+ auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
+ auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
+ auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
+
+ auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
+ auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
+ auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
+ Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
+ Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
+
+ auto Zero32 = B.buildConstant(S32, 0);
+ auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
+ auto Add2_HiC =
+ B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
+ auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
+ auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
+
+ auto UnmergeNumer = B.buildUnmerge(S32, Numer);
+ Register NumerLo = UnmergeNumer.getReg(0);
+ Register NumerHi = UnmergeNumer.getReg(1);
+
+ auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
+ auto Mul3 = B.buildMul(S64, Denom, MulHi3);
+ auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
+ Register Mul3_Lo = UnmergeMul3.getReg(0);
+ Register Mul3_Hi = UnmergeMul3.getReg(1);
+ auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
+ auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
+ auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
+ auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
+
+ auto UnmergeDenom = B.buildUnmerge(S32, Denom);
+ Register DenomLo = UnmergeDenom.getReg(0);
+ Register DenomHi = UnmergeDenom.getReg(1);
+
+ auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
+ auto C1 = B.buildSExt(S32, CmpHi);
+
+ auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
+ auto C2 = B.buildSExt(S32, CmpLo);
+
+ auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
+ auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
+
+ // TODO: Here and below, portions of the code could be enclosed in if/endif
+ // blocks. Currently the control flow is unconditional and we have 4 selects
+ // after the potential endif, substituting for PHIs.
+
+ // if C3 != 0 ...
+ auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
+ auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
+ auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
+ auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
+
+ auto One64 = B.buildConstant(S64, 1);
+ auto Add3 = B.buildAdd(S64, MulHi3, One64);
+
+ auto C4 =
+ B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
+ auto C5 =
+ B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
+ auto C6 = B.buildSelect(
+ S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
+
+ // if (C6 != 0)
+ auto Add4 = B.buildAdd(S64, Add3, One64);
+ auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
+
+ auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
+ auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
+ auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
+
+ // endif C6
+ // endif C3
+
+ if (IsDiv) {
+ auto Sel1 = B.buildSelect(
+ S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
+ B.buildSelect(DstReg,
+ B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
+ } else {
+ auto Sel2 = B.buildSelect(
+ S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
+ B.buildSelect(DstReg,
+ B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
+ }
+}
+
+bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ const LLT S64 = LLT::scalar(64);
+ const LLT S32 = LLT::scalar(32);
+ const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
+ Register DstReg = MI.getOperand(0).getReg();
+ Register Num = MI.getOperand(1).getReg();
+ Register Den = MI.getOperand(2).getReg();
+ LLT Ty = MRI.getType(DstReg);
+
+ if (Ty == S32)
+ legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
+ else if (Ty == S64)
+ legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
+ else
+ return false;
+
+ MI.eraseFromParent();
+ return true;
+
+}
+bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ const LLT S64 = LLT::scalar(64);
+ const LLT S32 = LLT::scalar(32);
+
+ Register DstReg = MI.getOperand(0).getReg();
+ const LLT Ty = MRI.getType(DstReg);
+ if (Ty != S32 && Ty != S64)
+ return false;
+
+ const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
+
+ Register LHS = MI.getOperand(1).getReg();
+ Register RHS = MI.getOperand(2).getReg();
+
+ auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
+ auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
+ auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
+
+ LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
+ RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
+
+ LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
+ RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
+
+ Register UDivRem = MRI.createGenericVirtualRegister(Ty);
+ if (Ty == S32)
+ legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
+ else
+ legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
+
+ Register Sign;
+ if (IsDiv)
+ Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
+ else
+ Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
+
+ UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
+ B.buildSub(DstReg, UDivRem, Sign);
+
+ MI.eraseFromParent();
+ return true;
+}
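// Illustrative sketch (assumption, not part of this patch): a scalar model of
// the branch-free sign handling above, which reduces sdiv/srem to the unsigned
// expansion. Assumes an arithmetic right shift for negative inputs.
#include <cstdint>

static int32_t sdiv32Model(int32_t L, int32_t R) {
  uint32_t LS = (uint32_t)(L >> 31);       // 0 or 0xffffffff (sign mask)
  uint32_t RS = (uint32_t)(R >> 31);
  uint32_t UL = ((uint32_t)L + LS) ^ LS;   // |L| via add-then-xor
  uint32_t UR = ((uint32_t)R + RS) ^ RS;   // |R|
  uint32_t UQ = UL / UR;                   // stands in for the udiv expansion
  uint32_t QS = LS ^ RS;                   // quotient sign; srem uses LS instead
  return (int32_t)((UQ ^ QS) - QS);        // xor + sub restores the sign
}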
+
bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
@@ -1954,7 +2828,7 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
return false;
if (!Unsafe && ResTy == S32 &&
- MF.getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals)
+ MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
return false;
if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
@@ -1997,7 +2871,6 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
- B.setInstr(MI);
Register Res = MI.getOperand(0).getReg();
Register LHS = MI.getOperand(1).getReg();
Register RHS = MI.getOperand(2).getReg();
@@ -2035,15 +2908,13 @@ static void toggleSPDenormMode(bool Enable,
AMDGPU::SIModeRegisterDefaults Mode) {
// Set SP denorm mode to this value.
unsigned SPDenormMode =
- Enable ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
+ Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
if (ST.hasDenormModeInst()) {
// Preserve default FP64FP16 denorm mode while updating FP32 mode.
- unsigned DPDenormModeDefault = Mode.FP64FP16Denormals
- ? FP_DENORM_FLUSH_NONE
- : FP_DENORM_FLUSH_IN_FLUSH_OUT;
+ uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
- unsigned NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
+ uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
B.buildInstr(AMDGPU::S_DENORM_MODE)
.addImm(NewDenormModeValue);
@@ -2062,7 +2933,6 @@ static void toggleSPDenormMode(bool Enable,
bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
- B.setInstr(MI);
Register Res = MI.getOperand(0).getReg();
Register LHS = MI.getOperand(1).getReg();
Register RHS = MI.getOperand(2).getReg();
@@ -2078,15 +2948,15 @@ bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
auto DenominatorScaled =
B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
- .addUse(RHS)
.addUse(LHS)
- .addImm(1)
+ .addUse(RHS)
+ .addImm(0)
.setMIFlags(Flags);
auto NumeratorScaled =
B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
.addUse(LHS)
.addUse(RHS)
- .addImm(0)
+ .addImm(1)
.setMIFlags(Flags);
auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
@@ -2096,7 +2966,7 @@ bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
// FIXME: Doesn't correctly model the FP mode switch, and the FP operations
// aren't modeled as reading it.
- if (!Mode.FP32Denormals)
+ if (!Mode.allFP32Denormals())
toggleSPDenormMode(true, B, ST, Mode);
auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
@@ -2106,7 +2976,7 @@ bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
- if (!Mode.FP32Denormals)
+ if (!Mode.allFP32Denormals())
toggleSPDenormMode(false, B, ST, Mode);
auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
@@ -2129,7 +2999,6 @@ bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
- B.setInstr(MI);
Register Res = MI.getOperand(0).getReg();
Register LHS = MI.getOperand(1).getReg();
Register RHS = MI.getOperand(2).getReg();
@@ -2144,7 +3013,7 @@ bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
.addUse(LHS)
.addUse(RHS)
- .addImm(1)
+ .addImm(0)
.setMIFlags(Flags);
auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
@@ -2160,11 +3029,11 @@ bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
.addUse(LHS)
.addUse(RHS)
- .addImm(0)
+ .addImm(1)
.setMIFlags(Flags);
auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
- auto Mul = B.buildMul(S64, DivScale1.getReg(0), Fma3, Flags);
+ auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
Register Scale;
@@ -2172,8 +3041,6 @@ bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
// Workaround a hardware bug on SI where the condition output from div_scale
// is not usable.
- Scale = MRI.createGenericVirtualRegister(S1);
-
LLT S32 = LLT::scalar(32);
auto NumUnmerge = B.buildUnmerge(S32, LHS);
@@ -2185,7 +3052,7 @@ bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
Scale1Unmerge.getReg(1));
auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
Scale0Unmerge.getReg(1));
- B.buildXor(Scale, CmpNum, CmpDen);
+ Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
} else {
Scale = DivScale1.getReg(1);
}
@@ -2210,7 +3077,6 @@ bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
- B.setInstr(MI);
Register Res = MI.getOperand(0).getReg();
Register LHS = MI.getOperand(2).getReg();
Register RHS = MI.getOperand(3).getReg();
@@ -2252,8 +3118,6 @@ bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
}
- B.setInstr(MI);
-
uint64_t Offset =
ST.getTargetLowering()->getImplicitParameterOffset(
B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
@@ -2263,8 +3127,9 @@ bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
const ArgDescriptor *Arg;
const TargetRegisterClass *RC;
- std::tie(Arg, RC)
- = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
+ LLT ArgTy;
+ std::tie(Arg, RC, ArgTy) =
+ MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
if (!Arg)
return false;
@@ -2281,7 +3146,6 @@ bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B,
unsigned AddrSpace) const {
- B.setInstr(MI);
Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
@@ -2289,6 +3153,55 @@ bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
return true;
}
+// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
+// offset (the offset that is included in bounds checking and swizzling, to be
+// split between the instruction's voffset and immoffset fields) and soffset
+// (the offset that is excluded from bounds checking and swizzling, to go in
+// the instruction's soffset field). This function takes the first kind of
+// offset and figures out how to split it between voffset and immoffset.
+std::tuple<Register, unsigned, unsigned>
+AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
+ Register OrigOffset) const {
+ const unsigned MaxImm = 4095;
+ Register BaseReg;
+ unsigned TotalConstOffset;
+ MachineInstr *OffsetDef;
+ const LLT S32 = LLT::scalar(32);
+
+ std::tie(BaseReg, TotalConstOffset, OffsetDef)
+ = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
+
+ unsigned ImmOffset = TotalConstOffset;
+
+ // If the immediate value is too big for the immoffset field, keep only the
+ // low 12 bits (value & 4095) in the immoffset field, so that the value that is
+ // copied/added for the voffset field is a multiple of 4096; that gives it a
+ // better chance of being CSEd with the copy/add for another similar load/store.
+ // However, do not do that rounding down to a multiple of 4096 if that is a
+ // negative number, as it appears to be illegal to have a negative offset
+ // in the vgpr, even if adding the immediate offset makes it positive.
+ unsigned Overflow = ImmOffset & ~MaxImm;
+ ImmOffset -= Overflow;
+ if ((int32_t)Overflow < 0) {
+ Overflow += ImmOffset;
+ ImmOffset = 0;
+ }
+
+ if (Overflow != 0) {
+ if (!BaseReg) {
+ BaseReg = B.buildConstant(S32, Overflow).getReg(0);
+ } else {
+ auto OverflowVal = B.buildConstant(S32, Overflow);
+ BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
+ }
+ }
+
+ if (!BaseReg)
+ BaseReg = B.buildConstant(S32, 0).getReg(0);
+
+ return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
+}
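// Illustrative sketch (assumption, not part of this patch): the constant-only
// part of the split above, showing how a known offset divides between the
// 12-bit immoffset field and the voffset register (MaxImm = 4095).
#include <cstdint>
#include <utility>

static std::pair<uint32_t, uint32_t> splitConstOffsetModel(uint32_t Offset) {
  uint32_t Imm = Offset & 4095u;    // kept in the instruction's immoffset field
  uint32_t VOff = Offset & ~4095u;  // multiple of 4096, copied/added into voffset
  return {VOff, Imm};               // e.g. 9000 -> {8192, 808}
}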
+
/// Handle register layout difference for f16 images for some subtargets.
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
MachineRegisterInfo &MRI,
@@ -2312,75 +3225,969 @@ Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
}
-bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
- MachineRegisterInfo &MRI,
- MachineIRBuilder &B,
- bool IsFormat) const {
- // TODO: Reject f16 format on targets where unsupported.
- Register VData = MI.getOperand(1).getReg();
- LLT Ty = MRI.getType(VData);
+Register AMDGPULegalizerInfo::fixStoreSourceType(
+ MachineIRBuilder &B, Register VData, bool IsFormat) const {
+ MachineRegisterInfo *MRI = B.getMRI();
+ LLT Ty = MRI->getType(VData);
- B.setInstr(MI);
-
- const LLT S32 = LLT::scalar(32);
const LLT S16 = LLT::scalar(16);
// Fixup illegal register types for i8 stores.
if (Ty == LLT::scalar(8) || Ty == S16) {
Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
- MI.getOperand(1).setReg(AnyExt);
- return true;
+ return AnyExt;
}
if (Ty.isVector()) {
if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
if (IsFormat)
- MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
+ return handleD16VData(B, *MRI, VData);
+ }
+ }
+
+ return VData;
+}
+
+bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B,
+ bool IsTyped,
+ bool IsFormat) const {
+ Register VData = MI.getOperand(1).getReg();
+ LLT Ty = MRI.getType(VData);
+ LLT EltTy = Ty.getScalarType();
+ const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
+ const LLT S32 = LLT::scalar(32);
+
+ VData = fixStoreSourceType(B, VData, IsFormat);
+ Register RSrc = MI.getOperand(2).getReg();
+
+ MachineMemOperand *MMO = *MI.memoperands_begin();
+ const int MemSize = MMO->getSize();
+
+ unsigned ImmOffset;
+ unsigned TotalOffset;
+
+ // The typed intrinsics add an immediate after the registers.
+ const unsigned NumVIndexOps = IsTyped ? 8 : 7;
+
+ // The struct intrinsic variants add one additional operand over raw.
+ const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
+ Register VIndex;
+ int OpOffset = 0;
+ if (HasVIndex) {
+ VIndex = MI.getOperand(3).getReg();
+ OpOffset = 1;
+ }
+
+ Register VOffset = MI.getOperand(3 + OpOffset).getReg();
+ Register SOffset = MI.getOperand(4 + OpOffset).getReg();
+
+ unsigned Format = 0;
+ if (IsTyped) {
+ Format = MI.getOperand(5 + OpOffset).getImm();
+ ++OpOffset;
+ }
+
+ unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
+
+ std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
+ if (TotalOffset != 0)
+ MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
+
+ unsigned Opc;
+ if (IsTyped) {
+ Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
+ AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
+ } else if (IsFormat) {
+ Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
+ AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
+ } else {
+ switch (MemSize) {
+ case 1:
+ Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
+ break;
+ case 2:
+ Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
+ break;
+ default:
+ Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
+ break;
+ }
+ }
+
+ if (!VIndex)
+ VIndex = B.buildConstant(S32, 0).getReg(0);
+
+ auto MIB = B.buildInstr(Opc)
+ .addUse(VData) // vdata
+ .addUse(RSrc) // rsrc
+ .addUse(VIndex) // vindex
+ .addUse(VOffset) // voffset
+ .addUse(SOffset) // soffset
+ .addImm(ImmOffset); // offset(imm)
+
+ if (IsTyped)
+ MIB.addImm(Format);
+
+ MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
+ .addImm(HasVIndex ? -1 : 0) // idxen(imm)
+ .addMemOperand(MMO);
+
+ MI.eraseFromParent();
+ return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B,
+ bool IsFormat,
+ bool IsTyped) const {
+ // FIXME: Verifier should enforce 1 MMO for these intrinsics.
+ MachineMemOperand *MMO = *MI.memoperands_begin();
+ const int MemSize = MMO->getSize();
+ const LLT S32 = LLT::scalar(32);
+
+ Register Dst = MI.getOperand(0).getReg();
+ Register RSrc = MI.getOperand(2).getReg();
+
+ // The typed intrinsics add an immediate after the registers.
+ const unsigned NumVIndexOps = IsTyped ? 8 : 7;
+
+ // The struct intrinsic variants add one additional operand over raw.
+ const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
+ Register VIndex;
+ int OpOffset = 0;
+ if (HasVIndex) {
+ VIndex = MI.getOperand(3).getReg();
+ OpOffset = 1;
+ }
+
+ Register VOffset = MI.getOperand(3 + OpOffset).getReg();
+ Register SOffset = MI.getOperand(4 + OpOffset).getReg();
+
+ unsigned Format = 0;
+ if (IsTyped) {
+ Format = MI.getOperand(5 + OpOffset).getImm();
+ ++OpOffset;
+ }
+
+ unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
+ unsigned ImmOffset;
+ unsigned TotalOffset;
+
+ LLT Ty = MRI.getType(Dst);
+ LLT EltTy = Ty.getScalarType();
+ const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
+ const bool Unpacked = ST.hasUnpackedD16VMem();
+
+ std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
+ if (TotalOffset != 0)
+ MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
+
+ unsigned Opc;
+
+ if (IsTyped) {
+ Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
+ AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
+ } else if (IsFormat) {
+ Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
+ AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
+ } else {
+ switch (MemSize) {
+ case 1:
+ Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
+ break;
+ case 2:
+ Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
+ break;
+ default:
+ Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
+ break;
+ }
+ }
+
+ Register LoadDstReg;
+
+ bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
+ LLT UnpackedTy = Ty.changeElementSize(32);
+
+ if (IsExtLoad)
+ LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
+ else if (Unpacked && IsD16 && Ty.isVector())
+ LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
+ else
+ LoadDstReg = Dst;
+
+ if (!VIndex)
+ VIndex = B.buildConstant(S32, 0).getReg(0);
+
+ auto MIB = B.buildInstr(Opc)
+ .addDef(LoadDstReg) // vdata
+ .addUse(RSrc) // rsrc
+ .addUse(VIndex) // vindex
+ .addUse(VOffset) // voffset
+ .addUse(SOffset) // soffset
+ .addImm(ImmOffset); // offset(imm)
+
+ if (IsTyped)
+ MIB.addImm(Format);
+
+ MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
+ .addImm(HasVIndex ? -1 : 0) // idxen(imm)
+ .addMemOperand(MMO);
+
+ if (LoadDstReg != Dst) {
+ B.setInsertPt(B.getMBB(), ++B.getInsertPt());
+
+ // The result was widened for the extending load; truncate back down.
+ if (IsExtLoad)
+ B.buildTrunc(Dst, LoadDstReg);
+ else {
+ // Repack to original 16-bit vector result
+ // FIXME: G_TRUNC should work, but legalization currently fails
+ auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
+ SmallVector<Register, 4> Repack;
+ for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
+ Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
+ B.buildMerge(Dst, Repack);
+ }
+ }
+
+ MI.eraseFromParent();
+ return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
+ MachineIRBuilder &B,
+ bool IsInc) const {
+ unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
+ AMDGPU::G_AMDGPU_ATOMIC_DEC;
+ B.buildInstr(Opc)
+ .addDef(MI.getOperand(0).getReg())
+ .addUse(MI.getOperand(2).getReg())
+ .addUse(MI.getOperand(3).getReg())
+ .cloneMemRefs(MI);
+ MI.eraseFromParent();
+ return true;
+}
+
+static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
+ switch (IntrID) {
+ case Intrinsic::amdgcn_raw_buffer_atomic_swap:
+ case Intrinsic::amdgcn_struct_buffer_atomic_swap:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
+ case Intrinsic::amdgcn_raw_buffer_atomic_add:
+ case Intrinsic::amdgcn_struct_buffer_atomic_add:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
+ case Intrinsic::amdgcn_raw_buffer_atomic_sub:
+ case Intrinsic::amdgcn_struct_buffer_atomic_sub:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
+ case Intrinsic::amdgcn_raw_buffer_atomic_smin:
+ case Intrinsic::amdgcn_struct_buffer_atomic_smin:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
+ case Intrinsic::amdgcn_raw_buffer_atomic_umin:
+ case Intrinsic::amdgcn_struct_buffer_atomic_umin:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
+ case Intrinsic::amdgcn_raw_buffer_atomic_smax:
+ case Intrinsic::amdgcn_struct_buffer_atomic_smax:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
+ case Intrinsic::amdgcn_raw_buffer_atomic_umax:
+ case Intrinsic::amdgcn_struct_buffer_atomic_umax:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
+ case Intrinsic::amdgcn_raw_buffer_atomic_and:
+ case Intrinsic::amdgcn_struct_buffer_atomic_and:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
+ case Intrinsic::amdgcn_raw_buffer_atomic_or:
+ case Intrinsic::amdgcn_struct_buffer_atomic_or:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
+ case Intrinsic::amdgcn_raw_buffer_atomic_xor:
+ case Intrinsic::amdgcn_struct_buffer_atomic_xor:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
+ case Intrinsic::amdgcn_raw_buffer_atomic_inc:
+ case Intrinsic::amdgcn_struct_buffer_atomic_inc:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
+ case Intrinsic::amdgcn_raw_buffer_atomic_dec:
+ case Intrinsic::amdgcn_struct_buffer_atomic_dec:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
+ case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
+ case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
+ default:
+ llvm_unreachable("unhandled atomic opcode");
+ }
+}
+
+bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
+ MachineIRBuilder &B,
+ Intrinsic::ID IID) const {
+ const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
+ IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
+
+ Register Dst = MI.getOperand(0).getReg();
+ Register VData = MI.getOperand(2).getReg();
+
+ Register CmpVal;
+ int OpOffset = 0;
+
+ if (IsCmpSwap) {
+ CmpVal = MI.getOperand(3 + OpOffset).getReg();
+ ++OpOffset;
+ }
+
+ Register RSrc = MI.getOperand(3 + OpOffset).getReg();
+ const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
+
+ // The struct intrinsic variants add one additional operand over raw.
+ const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
+ Register VIndex;
+ if (HasVIndex) {
+ VIndex = MI.getOperand(4 + OpOffset).getReg();
+ ++OpOffset;
+ }
+
+ Register VOffset = MI.getOperand(4 + OpOffset).getReg();
+ Register SOffset = MI.getOperand(5 + OpOffset).getReg();
+ unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
+
+ MachineMemOperand *MMO = *MI.memoperands_begin();
+
+ unsigned ImmOffset;
+ unsigned TotalOffset;
+ std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
+ if (TotalOffset != 0)
+ MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
+
+ if (!VIndex)
+ VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
+
+ auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
+ .addDef(Dst)
+ .addUse(VData); // vdata
+
+ if (IsCmpSwap)
+ MIB.addReg(CmpVal);
+
+ MIB.addUse(RSrc) // rsrc
+ .addUse(VIndex) // vindex
+ .addUse(VOffset) // voffset
+ .addUse(SOffset) // soffset
+ .addImm(ImmOffset) // offset(imm)
+ .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
+ .addImm(HasVIndex ? -1 : 0) // idxen(imm)
+ .addMemOperand(MMO);
+
+ MI.eraseFromParent();
+ return true;
+}
+
+/// Pack the s16 typed address operands of \p MI into dword sized vectors with
+/// s16 typed elements, appending the packed registers to \p PackedAddrs.
+static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
+ SmallVectorImpl<Register> &PackedAddrs,
+ int AddrIdx, int DimIdx, int EndIdx,
+ int NumGradients) {
+ const LLT S16 = LLT::scalar(16);
+ const LLT V2S16 = LLT::vector(2, 16);
+
+ for (int I = AddrIdx; I < EndIdx; ++I) {
+ MachineOperand &SrcOp = MI.getOperand(I);
+ if (!SrcOp.isReg())
+ continue; // _L to _LZ may have eliminated this.
+
+ Register AddrReg = SrcOp.getReg();
+
+ if (I < DimIdx) {
+ AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
+ PackedAddrs.push_back(AddrReg);
+ } else {
+ // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
+ // derivatives dx/dh and dx/dv are packed with undef.
+ if (((I + 1) >= EndIdx) ||
+ ((NumGradients / 2) % 2 == 1 &&
+ (I == DimIdx + (NumGradients / 2) - 1 ||
+ I == DimIdx + NumGradients - 1)) ||
+ // Check for _L to _LZ optimization
+ !MI.getOperand(I + 1).isReg()) {
+ PackedAddrs.push_back(
+ B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
+ .getReg(0));
+ } else {
+ PackedAddrs.push_back(
+ B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
+ .getReg(0));
+ ++I;
+ }
+ }
+ }
+}
+
+/// Convert from separate vaddr components to a single vector address register,
+/// and replace the remaining operands with $noreg.
+static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
+ int DimIdx, int NumVAddrs) {
+ const LLT S32 = LLT::scalar(32);
+
+ SmallVector<Register, 8> AddrRegs;
+ for (int I = 0; I != NumVAddrs; ++I) {
+ MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
+ if (SrcOp.isReg()) {
+ AddrRegs.push_back(SrcOp.getReg());
+ assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
+ }
+ }
+
+ int NumAddrRegs = AddrRegs.size();
+ if (NumAddrRegs != 1) {
+ // Round up to 8 elements for v5-v7
+ // FIXME: Missing intermediate sized register classes and instructions.
+ if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
+ const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
+ auto Undef = B.buildUndef(S32);
+ AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
+ NumAddrRegs = RoundedNumRegs;
+ }
+
+ auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
+ MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
+ }
+
+ for (int I = 1; I != NumVAddrs; ++I) {
+ MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
+ if (SrcOp.isReg())
+ MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
+ }
+}
+
+/// Rewrite image intrinsics to use register layouts expected by the subtarget.
+///
+/// Depending on the subtarget, load/store with 16-bit element data need to be
+/// rewritten to use the low half of 32-bit registers, or directly use a packed
+/// layout. 16-bit addresses should also sometimes be packed into 32-bit
+/// registers.
+///
+/// We don't want to directly select image instructions just yet, but also want
+/// to expose all register repacking to the legalizer/combiners. We also don't
+/// want a selected instruction entering RegBankSelect. In order to avoid
+/// defining a multitude of intermediate image instructions, directly hack on
+/// the intrinsic's arguments. In cases like a16 addresses, this requires padding
+/// now unnecessary arguments with $noreg.
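+///
+/// For example, on an a16-capable target a 2D load's two s16 coordinates are
+/// packed into a single <2 x s16> vaddr register and the freed operand slot
+/// is replaced with $noreg.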
+bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
+ MachineInstr &MI, MachineIRBuilder &B,
+ GISelChangeObserver &Observer,
+ const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
+
+ const int NumDefs = MI.getNumExplicitDefs();
+ bool IsTFE = NumDefs == 2;
+ // We are only processing the operands of d16 image operations on subtargets
+ // that use the unpacked register layout, or need to repack the TFE result.
+
+ // TODO: Do we need to guard against already legalized intrinsics?
+ const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
+ AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
+
+ MachineRegisterInfo *MRI = B.getMRI();
+ const LLT S32 = LLT::scalar(32);
+ const LLT S16 = LLT::scalar(16);
+ const LLT V2S16 = LLT::vector(2, 16);
+
+ // Index of first address argument
+ const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
+
+ int NumVAddrs, NumGradients;
+ std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
+ const int DMaskIdx = BaseOpcode->Atomic ? -1 :
+ getDMaskIdx(BaseOpcode, NumDefs);
+ unsigned DMask = 0;
+
+ // Check for 16 bit addresses and pack if true.
+ int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
+ LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
+ LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
+ const bool IsG16 = GradTy == S16;
+ const bool IsA16 = AddrTy == S16;
+
+ int DMaskLanes = 0;
+ if (!BaseOpcode->Atomic) {
+ DMask = MI.getOperand(DMaskIdx).getImm();
+ if (BaseOpcode->Gather4) {
+ DMaskLanes = 4;
+ } else if (DMask != 0) {
+ DMaskLanes = countPopulation(DMask);
+ } else if (!IsTFE && !BaseOpcode->Store) {
+ // If dmask is 0, this is a no-op load. This can be eliminated.
+ B.buildUndef(MI.getOperand(0));
+ MI.eraseFromParent();
return true;
}
+ }
+
+ Observer.changingInstr(MI);
+ auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
+
+ unsigned NewOpcode = NumDefs == 0 ?
+ AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
+
+ // Track that we legalized this
+ MI.setDesc(B.getTII().get(NewOpcode));
+
+ // We expect to get an error flag since TFC is on and dmask is 0. Force
+ // dmask to be at least 1, otherwise the instruction will fail.
+ if (IsTFE && DMask == 0) {
+ DMask = 0x1;
+ DMaskLanes = 1;
+ MI.getOperand(DMaskIdx).setImm(DMask);
+ }
+
+ if (BaseOpcode->Atomic) {
+ Register VData0 = MI.getOperand(2).getReg();
+ LLT Ty = MRI->getType(VData0);
+
+ // TODO: Allow atomic swap and bit ops for v2s16/v4s16
+ if (Ty.isVector())
+ return false;
+
+ if (BaseOpcode->AtomicX2) {
+ Register VData1 = MI.getOperand(3).getReg();
+ // The two values are packed in one register.
+ LLT PackedTy = LLT::vector(2, Ty);
+ auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
+ MI.getOperand(2).setReg(Concat.getReg(0));
+ MI.getOperand(3).setReg(AMDGPU::NoRegister);
+ }
+ }
- return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
+ int CorrectedNumVAddrs = NumVAddrs;
+
+ // Optimize _L to _LZ when _L is zero
+ if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
+ AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
+ const ConstantFP *ConstantLod;
+ const int LodIdx = AddrIdx + NumVAddrs - 1;
+
+ if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
+ if (ConstantLod->isZero() || ConstantLod->isNegative()) {
+ // Set new opcode to _lz variant of _l, and change the intrinsic ID.
+ ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
+ LZMappingInfo->LZ, ImageDimIntr->Dim);
+
+ // The starting indexes should remain in the same place.
+ --NumVAddrs;
+ --CorrectedNumVAddrs;
+
+ MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
+ static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
+ MI.RemoveOperand(LodIdx);
+ }
+ }
}
- return Ty == S32;
+ // Optimize _mip away when 'lod' is zero.
+ if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
+ int64_t ConstantLod;
+ const int LodIdx = AddrIdx + NumVAddrs - 1;
+
+ if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
+ if (ConstantLod == 0) {
+ // TODO: Change the intrinsic opcode and remove the operand instead of
+ // replacing it with 0, as is done for the _L to _LZ handling above.
+ MI.getOperand(LodIdx).ChangeToImmediate(0);
+ --CorrectedNumVAddrs;
+ }
+ }
+ }
+
+ // Rewrite the addressing register layout before doing anything else.
+ if (IsA16 || IsG16) {
+ if (IsA16) {
+ // Target must support the feature and gradients need to be 16 bit too
+ if (!ST.hasA16() || !IsG16)
+ return false;
+ } else if (!ST.hasG16())
+ return false;
+
+ if (NumVAddrs > 1) {
+ SmallVector<Register, 4> PackedRegs;
+ // Don't compress addresses for G16
+ const int PackEndIdx =
+ IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients);
+ packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx,
+ PackEndIdx, NumGradients);
+
+ if (!IsA16) {
+ // Add uncompressed address
+ for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) {
+ int AddrReg = MI.getOperand(I).getReg();
+ assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
+ PackedRegs.push_back(AddrReg);
+ }
+ }
+
+ // See also below in the non-a16 branch
+ const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
+
+ if (!UseNSA && PackedRegs.size() > 1) {
+ LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
+ auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
+ PackedRegs[0] = Concat.getReg(0);
+ PackedRegs.resize(1);
+ }
+
+ const int NumPacked = PackedRegs.size();
+ for (int I = 0; I != NumVAddrs; ++I) {
+ MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
+ if (!SrcOp.isReg()) {
+ assert(SrcOp.isImm() && SrcOp.getImm() == 0);
+ continue;
+ }
+
+ assert(SrcOp.getReg() != AMDGPU::NoRegister);
+
+ if (I < NumPacked)
+ SrcOp.setReg(PackedRegs[I]);
+ else
+ SrcOp.setReg(AMDGPU::NoRegister);
+ }
+ }
+ } else {
+ // If the register allocator cannot place the address registers contiguously
+ // without introducing moves, then using the non-sequential address encoding
+ // is always preferable, since it saves VALU instructions and is usually a
+ // wash in terms of code size or even better.
+ //
+ // However, we currently have no way of hinting to the register allocator
+ // that MIMG addresses should be placed contiguously when it is possible to
+ // do so, so force non-NSA for the common 2-address case as a heuristic.
+ //
+ // SIShrinkInstructions will convert NSA encodings to non-NSA after register
+ // allocation when possible.
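+ //
+ // E.g. a plain 2D load (two vaddr registers) is packed into a <2 x s32>
+ // vector here, while a 3D or array access with three or more address
+ // registers keeps them separate when NSA is available.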
+ const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
+
+ if (!UseNSA && NumVAddrs > 1)
+ convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
+ }
+
+ int Flags = 0;
+ if (IsA16)
+ Flags |= 1;
+ if (IsG16)
+ Flags |= 2;
+ MI.addOperand(MachineOperand::CreateImm(Flags));
+
+ if (BaseOpcode->Store) { // No TFE for stores?
+ // TODO: Handle dmask trim
+ Register VData = MI.getOperand(1).getReg();
+ LLT Ty = MRI->getType(VData);
+ if (!Ty.isVector() || Ty.getElementType() != S16)
+ return true;
+
+ Register RepackedReg = handleD16VData(B, *MRI, VData);
+ if (RepackedReg != VData) {
+ MI.getOperand(1).setReg(RepackedReg);
+ }
+
+ return true;
+ }
+
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT Ty = MRI->getType(DstReg);
+ const LLT EltTy = Ty.getScalarType();
+ const bool IsD16 = Ty.getScalarType() == S16;
+ const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
+
+ // Confirm that the return type is large enough for the dmask specified
+ if (NumElts < DMaskLanes)
+ return false;
+
+ if (NumElts > 4 || DMaskLanes > 4)
+ return false;
+
+ const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
+ const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
+
+ // The raw dword aligned data component of the load. The only legal cases
+ // where this matters should be when using the packed D16 format, for
+ // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
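+ // E.g. a packed-d16 <3 x s16> result is rounded up to <4 x s16> (two
+ // dwords).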
+ LLT RoundedTy;
+
+ // S32 vector to cover all data, plus the TFE result element.
+ LLT TFETy;
+
+ // Register type to use for each loaded component. Will be S32 or V2S16.
+ LLT RegTy;
+
+ if (IsD16 && ST.hasUnpackedD16VMem()) {
+ RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
+ TFETy = LLT::vector(AdjustedNumElts + 1, 32);
+ RegTy = S32;
+ } else {
+ unsigned EltSize = EltTy.getSizeInBits();
+ unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
+ unsigned RoundedSize = 32 * RoundedElts;
+ RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
+ TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
+ RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
+ }
+
+ // The return type does not need adjustment.
+ // TODO: Should we change s16 case to s32 or <2 x s16>?
+ if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
+ return true;
+
+ Register Dst1Reg;
+
+ // Insert after the instruction.
+ B.setInsertPt(*MI.getParent(), ++MI.getIterator());
+
+ // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
+ // s16> instead of s32, we would only need 1 bitcast instead of multiple.
+ const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
+ const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
+
+ Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
+
+ MI.getOperand(0).setReg(NewResultReg);
+
+ // In the IR, TFE is supposed to be used with a 2 element struct return
+ // type. The instruction really returns these two values in one contiguous
+ // register, with one additional dword beyond the loaded data. Rewrite the
+ // return type to use a single register result.
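+ //
+ // E.g. a TFE load declared as returning {<4 x s32>, i32} in the IR is
+ // rewritten here to define one contiguous <5 x s32> result, with the
+ // trailing dword holding the error/status flag.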
+
+ if (IsTFE) {
+ Dst1Reg = MI.getOperand(1).getReg();
+ if (MRI->getType(Dst1Reg) != S32)
+ return false;
+
+ // TODO: Make sure the TFE operand bit is set.
+ MI.RemoveOperand(1);
+
+ // Handle the easy case that requires no repack instructions.
+ if (Ty == S32) {
+ B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
+ return true;
+ }
+ }
+
+ // Now figure out how to copy the new result register back into the old
+ // result.
+ SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
+
+ const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
+
+ if (ResultNumRegs == 1) {
+ assert(!IsTFE);
+ ResultRegs[0] = NewResultReg;
+ } else {
+ // We have to repack into a new vector of some kind.
+ for (int I = 0; I != NumDataRegs; ++I)
+ ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
+ B.buildUnmerge(ResultRegs, NewResultReg);
+
+ // Drop the final TFE element to get the data part. The TFE result is
+ // directly written to the right place already.
+ if (IsTFE)
+ ResultRegs.resize(NumDataRegs);
+ }
+
+ // For an s16 scalar result, we form an s32 result with a truncate regardless
+ // of packed vs. unpacked.
+ if (IsD16 && !Ty.isVector()) {
+ B.buildTrunc(DstReg, ResultRegs[0]);
+ return true;
+ }
+
+ // Avoid a build/concat_vector of 1 entry.
+ if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
+ B.buildBitcast(DstReg, ResultRegs[0]);
+ return true;
+ }
+
+ assert(Ty.isVector());
+
+ if (IsD16) {
+ // For packed D16 results with TFE enabled, all the data components are
+ // S32. Cast back to the expected type.
+ //
+ // TODO: We don't really need to load s32 elements. We would only need one
+ // cast for the TFE result if a multiple of v2s16 was used.
+ if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
+ for (Register &Reg : ResultRegs)
+ Reg = B.buildBitcast(V2S16, Reg).getReg(0);
+ } else if (ST.hasUnpackedD16VMem()) {
+ for (Register &Reg : ResultRegs)
+ Reg = B.buildTrunc(S16, Reg).getReg(0);
+ }
+ }
+
+ auto padWithUndef = [&](LLT Ty, int NumElts) {
+ if (NumElts == 0)
+ return;
+ Register Undef = B.buildUndef(Ty).getReg(0);
+ for (int I = 0; I != NumElts; ++I)
+ ResultRegs.push_back(Undef);
+ };
+
+ // Pad out any elements eliminated due to the dmask.
+ LLT ResTy = MRI->getType(ResultRegs[0]);
+ if (!ResTy.isVector()) {
+ padWithUndef(ResTy, NumElts - ResultRegs.size());
+ B.buildBuildVector(DstReg, ResultRegs);
+ return true;
+ }
+
+ assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
+ const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
+
+ // Deal with the one annoying legal case.
+ const LLT V3S16 = LLT::vector(3, 16);
+ if (Ty == V3S16) {
+ padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
+ auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
+ B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
+ return true;
+ }
+
+ padWithUndef(ResTy, RegsToCover - ResultRegs.size());
+ B.buildConcatVectors(DstReg, ResultRegs);
+ return true;
}
-bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
- MachineRegisterInfo &MRI,
- MachineIRBuilder &B) const {
+bool AMDGPULegalizerInfo::legalizeSBufferLoad(
+ MachineInstr &MI, MachineIRBuilder &B,
+ GISelChangeObserver &Observer) const {
+ Register Dst = MI.getOperand(0).getReg();
+ LLT Ty = B.getMRI()->getType(Dst);
+ unsigned Size = Ty.getSizeInBits();
+ MachineFunction &MF = B.getMF();
+
+ Observer.changingInstr(MI);
+
+ // FIXME: We don't really need this intermediate instruction. The intrinsic
+ // should be fixed to have a memory operand. Since it's readnone, we're not
+ // allowed to add one.
+ MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
+ MI.RemoveOperand(1); // Remove intrinsic ID
+
+ // FIXME: When intrinsic definition is fixed, this should have an MMO already.
+ // TODO: Should this use datalayout alignment?
+ const unsigned MemSize = (Size + 7) / 8;
+ const Align MemAlign(4);
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo(),
+ MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant,
+ MemSize, MemAlign);
+ MI.addMemOperand(MF, MMO);
+
+ // There are no 96-bit result scalar loads, but widening to 128-bit should
+ // always be legal. We may need to restore this to a 96-bit result if it turns
+ // out this needs to be converted to a vector load during RegBankSelect.
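+ //
+ // E.g. an s96 result is widened to s128 here, and a <3 x s32> result to
+ // <4 x s32>.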
+ if (!isPowerOf2_32(Size)) {
+ LegalizerHelper Helper(MF, *this, Observer, B);
+
+ if (Ty.isVector())
+ Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
+ else
+ Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
+ }
+
+ Observer.changedInstr(MI);
+ return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ // If this is a non-HSA path or the trap handler is disabled, insert an
+ // s_endpgm instruction.
+ if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
+ !ST.isTrapHandlerEnabled()) {
+ B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
+ } else {
+ // Pass queue pointer to trap handler as input, and insert trap instruction
+ // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
+ const ArgDescriptor *Arg =
+ getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
+ if (!Arg)
+ return false;
+ MachineRegisterInfo &MRI = *B.getMRI();
+ Register SGPR01(AMDGPU::SGPR0_SGPR1);
+ Register LiveIn = getLiveInRegister(
+ B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
+ /*InsertLiveInCopy=*/false);
+ if (!loadInputValue(LiveIn, B, Arg))
+ return false;
+ B.buildCopy(SGPR01, LiveIn);
+ B.buildInstr(AMDGPU::S_TRAP)
+ .addImm(GCNSubtarget::TrapIDLLVMTrap)
+ .addReg(SGPR01, RegState::Implicit);
+ }
+
+ MI.eraseFromParent();
+ return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
+ MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
+ // If this is a non-HSA path or the trap handler is disabled, report a
+ // warning accordingly.
+ if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
+ !ST.isTrapHandlerEnabled()) {
+ DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
+ "debugtrap handler not supported",
+ MI.getDebugLoc(), DS_Warning);
+ LLVMContext &Ctx = B.getMF().getFunction().getContext();
+ Ctx.diagnose(NoTrap);
+ } else {
+ // Insert debug-trap instruction
+ B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
+ }
+
+ MI.eraseFromParent();
+ return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
+ MachineInstr &MI) const {
+ MachineIRBuilder &B = Helper.MIRBuilder;
+ MachineRegisterInfo &MRI = *B.getMRI();
+
// Replace the use G_BRCOND with the exec manipulate and branch pseudos.
auto IntrID = MI.getIntrinsicID();
switch (IntrID) {
case Intrinsic::amdgcn_if:
case Intrinsic::amdgcn_else: {
MachineInstr *Br = nullptr;
- if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
+ MachineBasicBlock *UncondBrTarget = nullptr;
+ if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
const SIRegisterInfo *TRI
= static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
- B.setInstr(*BrCond);
Register Def = MI.getOperand(1).getReg();
Register Use = MI.getOperand(3).getReg();
- MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
- if (Br)
- BrTarget = Br->getOperand(0).getMBB();
-
+ MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
+ B.setInsertPt(B.getMBB(), BrCond->getIterator());
if (IntrID == Intrinsic::amdgcn_if) {
B.buildInstr(AMDGPU::SI_IF)
.addDef(Def)
.addUse(Use)
- .addMBB(BrTarget);
+ .addMBB(UncondBrTarget);
} else {
B.buildInstr(AMDGPU::SI_ELSE)
.addDef(Def)
.addUse(Use)
- .addMBB(BrTarget)
+ .addMBB(UncondBrTarget)
.addImm(0);
}
- if (Br)
- Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
+ if (Br) {
+ Br->getOperand(0).setMBB(CondBrTarget);
+ } else {
+ // The IRTranslator skips inserting the G_BR for fallthrough cases, but
+ // since we're swapping branch targets it needs to be reinserted.
+ // FIXME: IRTranslator should probably not do this
+ B.buildBr(*CondBrTarget);
+ }
MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
@@ -2393,17 +4200,24 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
}
case Intrinsic::amdgcn_loop: {
MachineInstr *Br = nullptr;
- if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
+ MachineBasicBlock *UncondBrTarget = nullptr;
+ if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
const SIRegisterInfo *TRI
= static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
- B.setInstr(*BrCond);
-
- // FIXME: Need to adjust branch targets based on unconditional branch.
+ MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
Register Reg = MI.getOperand(2).getReg();
+
+ B.setInsertPt(B.getMBB(), BrCond->getIterator());
B.buildInstr(AMDGPU::SI_LOOP)
.addUse(Reg)
- .addMBB(BrCond->getOperand(1).getMBB());
+ .addMBB(UncondBrTarget);
+
+ if (Br)
+ Br->getOperand(0).setMBB(CondBrTarget);
+ else
+ B.buildBr(*CondBrTarget);
+
MI.eraseFromParent();
BrCond->eraseFromParent();
MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
@@ -2413,6 +4227,13 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
return false;
}
case Intrinsic::amdgcn_kernarg_segment_ptr:
+ if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
+ // This only makes sense to call in a kernel, so just lower to null.
+ B.buildConstant(MI.getOperand(0).getReg(), 0);
+ MI.eraseFromParent();
+ return true;
+ }
+
return legalizePreloadedArgIntrin(
MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
case Intrinsic::amdgcn_implicitarg_ptr:
@@ -2454,18 +4275,72 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
case Intrinsic::amdgcn_is_private:
return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
case Intrinsic::amdgcn_wavefrontsize: {
- B.setInstr(MI);
B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
MI.eraseFromParent();
return true;
}
+ case Intrinsic::amdgcn_s_buffer_load:
+ return legalizeSBufferLoad(MI, B, Helper.Observer);
case Intrinsic::amdgcn_raw_buffer_store:
- return legalizeRawBufferStore(MI, MRI, B, false);
+ case Intrinsic::amdgcn_struct_buffer_store:
+ return legalizeBufferStore(MI, MRI, B, false, false);
case Intrinsic::amdgcn_raw_buffer_store_format:
- return legalizeRawBufferStore(MI, MRI, B, true);
- default:
+ case Intrinsic::amdgcn_struct_buffer_store_format:
+ return legalizeBufferStore(MI, MRI, B, false, true);
+ case Intrinsic::amdgcn_raw_tbuffer_store:
+ case Intrinsic::amdgcn_struct_tbuffer_store:
+ return legalizeBufferStore(MI, MRI, B, true, true);
+ case Intrinsic::amdgcn_raw_buffer_load:
+ case Intrinsic::amdgcn_struct_buffer_load:
+ return legalizeBufferLoad(MI, MRI, B, false, false);
+ case Intrinsic::amdgcn_raw_buffer_load_format:
+ case Intrinsic::amdgcn_struct_buffer_load_format:
+ return legalizeBufferLoad(MI, MRI, B, true, false);
+ case Intrinsic::amdgcn_raw_tbuffer_load:
+ case Intrinsic::amdgcn_struct_tbuffer_load:
+ return legalizeBufferLoad(MI, MRI, B, true, true);
+ case Intrinsic::amdgcn_raw_buffer_atomic_swap:
+ case Intrinsic::amdgcn_struct_buffer_atomic_swap:
+ case Intrinsic::amdgcn_raw_buffer_atomic_add:
+ case Intrinsic::amdgcn_struct_buffer_atomic_add:
+ case Intrinsic::amdgcn_raw_buffer_atomic_sub:
+ case Intrinsic::amdgcn_struct_buffer_atomic_sub:
+ case Intrinsic::amdgcn_raw_buffer_atomic_smin:
+ case Intrinsic::amdgcn_struct_buffer_atomic_smin:
+ case Intrinsic::amdgcn_raw_buffer_atomic_umin:
+ case Intrinsic::amdgcn_struct_buffer_atomic_umin:
+ case Intrinsic::amdgcn_raw_buffer_atomic_smax:
+ case Intrinsic::amdgcn_struct_buffer_atomic_smax:
+ case Intrinsic::amdgcn_raw_buffer_atomic_umax:
+ case Intrinsic::amdgcn_struct_buffer_atomic_umax:
+ case Intrinsic::amdgcn_raw_buffer_atomic_and:
+ case Intrinsic::amdgcn_struct_buffer_atomic_and:
+ case Intrinsic::amdgcn_raw_buffer_atomic_or:
+ case Intrinsic::amdgcn_struct_buffer_atomic_or:
+ case Intrinsic::amdgcn_raw_buffer_atomic_xor:
+ case Intrinsic::amdgcn_struct_buffer_atomic_xor:
+ case Intrinsic::amdgcn_raw_buffer_atomic_inc:
+ case Intrinsic::amdgcn_struct_buffer_atomic_inc:
+ case Intrinsic::amdgcn_raw_buffer_atomic_dec:
+ case Intrinsic::amdgcn_struct_buffer_atomic_dec:
+ case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
+ case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
+ return legalizeBufferAtomic(MI, B, IntrID);
+ case Intrinsic::amdgcn_atomic_inc:
+ return legalizeAtomicIncDec(MI, B, true);
+ case Intrinsic::amdgcn_atomic_dec:
+ return legalizeAtomicIncDec(MI, B, false);
+ case Intrinsic::trap:
+ return legalizeTrapIntrinsic(MI, MRI, B);
+ case Intrinsic::debugtrap:
+ return legalizeDebugTrapIntrinsic(MI, MRI, B);
+ default: {
+ if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
+ AMDGPU::getImageDimIntrinsicInfo(IntrID))
+ return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
return true;
}
+ }
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index 4b1405a92787a..ce32bbf76b34f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -32,9 +32,7 @@ public:
AMDGPULegalizerInfo(const GCNSubtarget &ST,
const GCNTargetMachine &TM);
- bool legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &B,
- GISelChangeObserver &Observer) const override;
+ bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI) const override;
Register getSegmentAperture(unsigned AddrSpace,
MachineRegisterInfo &MRI,
@@ -50,18 +48,22 @@ public:
MachineIRBuilder &B) const;
bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B, bool Signed) const;
- bool legalizeMinNumMaxNum(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &B) const;
+ bool legalizeFPTOI(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B, bool Signed) const;
+ bool legalizeMinNumMaxNum(LegalizerHelper &Helper, MachineInstr &MI) const;
bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
+ bool legalizeShuffleVector(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
+
bool legalizeSinCos(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
- bool buildPCRelGlobalAddress(
- Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
- unsigned Offset, unsigned GAFlags = SIInstrInfo::MO_NONE) const;
+ bool buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B,
+ const GlobalValue *GV, int64_t Offset,
+ unsigned GAFlags = SIInstrInfo::MO_NONE) const;
bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
@@ -74,16 +76,50 @@ public:
bool legalizeAtomicCmpXChg(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
+ bool legalizeFlog(MachineInstr &MI, MachineIRBuilder &B,
+ double Log2BaseInverted) const;
+ bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const;
+ bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const;
+ bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
- Register getLiveInRegister(MachineRegisterInfo &MRI,
- Register Reg, LLT Ty) const;
+ bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
+ Register getLiveInRegister(MachineIRBuilder &B, MachineRegisterInfo &MRI,
+ Register PhyReg, LLT Ty,
+ bool InsertLiveInCopy = true) const;
+ Register insertLiveInCopy(MachineIRBuilder &B, MachineRegisterInfo &MRI,
+ Register LiveIn, Register PhyReg) const;
+ const ArgDescriptor *
+ getArgDescriptor(MachineIRBuilder &B,
+ AMDGPUFunctionArgInfo::PreloadedValue ArgType) const;
bool loadInputValue(Register DstReg, MachineIRBuilder &B,
const ArgDescriptor *Arg) const;
bool legalizePreloadedArgIntrin(
MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
AMDGPUFunctionArgInfo::PreloadedValue ArgType) const;
+ bool legalizeUDIV_UREM(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
+
+ void legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
+ Register DstReg, Register Num, Register Den,
+ bool IsRem) const;
+ bool legalizeUDIV_UREM32(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
+ bool legalizeSDIV_SREM32(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
+
+ void legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
+ Register DstReg, Register Numer, Register Denom,
+ bool IsDiv) const;
+
+ bool legalizeUDIV_UREM64(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
+ bool legalizeSDIV_SREM(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
+
bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
bool legalizeFDIV16(MachineInstr &MI, MachineRegisterInfo &MRI,
@@ -102,13 +138,46 @@ public:
bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B, unsigned AddrSpace) const;
+ std::tuple<Register, unsigned, unsigned>
+ splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const;
+
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI,
Register Reg) const;
bool legalizeRawBufferStore(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B, bool IsFormat) const;
- bool legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &B) const override;
+ bool legalizeRawBufferLoad(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B, bool IsFormat) const;
+ Register fixStoreSourceType(MachineIRBuilder &B, Register VData,
+ bool IsFormat) const;
+
+ bool legalizeBufferStore(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B, bool IsTyped,
+ bool IsFormat) const;
+ bool legalizeBufferLoad(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B, bool IsTyped,
+ bool IsFormat) const;
+ bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B,
+ Intrinsic::ID IID) const;
+
+ bool legalizeImageIntrinsic(
+ MachineInstr &MI, MachineIRBuilder &B,
+ GISelChangeObserver &Observer,
+ const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const;
+
+ bool legalizeSBufferLoad(
+ MachineInstr &MI, MachineIRBuilder &B,
+ GISelChangeObserver &Observer) const;
+
+ bool legalizeAtomicIncDec(MachineInstr &MI, MachineIRBuilder &B,
+ bool IsInc) const;
+
+ bool legalizeTrapIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
+ bool legalizeDebugTrapIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
+ bool legalizeIntrinsic(LegalizerHelper &Helper,
+ MachineInstr &MI) const override;
};
} // End llvm namespace.
#endif
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index 0c56927dea02b..4a14259f1bdb1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -32,7 +32,6 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetOptions.h"
#include <cmath>
#include <vector>
@@ -170,16 +169,13 @@ namespace {
class AMDGPUSimplifyLibCalls : public FunctionPass {
- const TargetOptions Options;
-
AMDGPULibCalls Simplifier;
public:
static char ID; // Pass identification
- AMDGPUSimplifyLibCalls(const TargetOptions &Opt = TargetOptions(),
- const TargetMachine *TM = nullptr)
- : FunctionPass(ID), Options(Opt), Simplifier(TM) {
+ AMDGPUSimplifyLibCalls(const TargetMachine *TM = nullptr)
+ : FunctionPass(ID), Simplifier(TM) {
initializeAMDGPUSimplifyLibCallsPass(*PassRegistry::getPassRegistry());
}
@@ -585,7 +581,7 @@ bool AMDGPULibCalls::fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
assert(Callee->hasName() && "Invalid read_pipe/write_pipe function");
auto *M = Callee->getParent();
auto &Ctx = M->getContext();
- std::string Name = Callee->getName();
+ std::string Name = std::string(Callee->getName());
auto NumArg = CI->getNumArgOperands();
if (NumArg != 4 && NumArg != 6)
return false;
@@ -594,15 +590,15 @@ bool AMDGPULibCalls::fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
if (!isa<ConstantInt>(PacketSize) || !isa<ConstantInt>(PacketAlign))
return false;
unsigned Size = cast<ConstantInt>(PacketSize)->getZExtValue();
- unsigned Align = cast<ConstantInt>(PacketAlign)->getZExtValue();
- if (Size != Align || !isPowerOf2_32(Size))
+ Align Alignment = cast<ConstantInt>(PacketAlign)->getAlignValue();
+ if (Alignment != Size)
return false;
Type *PtrElemTy;
if (Size <= 8)
PtrElemTy = Type::getIntNTy(Ctx, Size * 8);
else
- PtrElemTy = VectorType::get(Type::getInt64Ty(Ctx), Size / 8);
+ PtrElemTy = FixedVectorType::get(Type::getInt64Ty(Ctx), Size / 8);
unsigned PtrArgLoc = CI->getNumArgOperands() - 3;
auto PtrArg = CI->getArgOperand(PtrArgLoc);
unsigned PtrArgAS = PtrArg->getType()->getPointerAddressSpace();
@@ -1130,8 +1126,8 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B,
Type* rTy = opr0->getType();
Type* nTyS = eltType->isDoubleTy() ? B.getInt64Ty() : B.getInt32Ty();
Type *nTy = nTyS;
- if (const VectorType *vTy = dyn_cast<VectorType>(rTy))
- nTy = VectorType::get(nTyS, vTy->getNumElements());
+ if (const auto *vTy = dyn_cast<FixedVectorType>(rTy))
+ nTy = FixedVectorType::get(nTyS, vTy);
unsigned size = nTy->getScalarSizeInBits();
opr_n = CI->getArgOperand(1);
if (opr_n->getType()->isIntegerTy())
@@ -1420,8 +1416,8 @@ AllocaInst* AMDGPULibCalls::insertAlloca(CallInst *UI, IRBuilder<> &B,
B.SetInsertPoint(&*ItNew);
AllocaInst *Alloc = B.CreateAlloca(RetType, 0,
std::string(prefix) + UI->getName());
- Alloc->setAlignment(MaybeAlign(
- UCallee->getParent()->getDataLayout().getTypeAllocSize(RetType)));
+ Alloc->setAlignment(
+ Align(UCallee->getParent()->getDataLayout().getTypeAllocSize(RetType)));
return Alloc;
}
@@ -1711,35 +1707,14 @@ bool AMDGPULibCalls::evaluateCall(CallInst *aCI, FuncInfo &FInfo) {
}
// Public interface to the Simplify LibCalls pass.
-FunctionPass *llvm::createAMDGPUSimplifyLibCallsPass(const TargetOptions &Opt,
- const TargetMachine *TM) {
- return new AMDGPUSimplifyLibCalls(Opt, TM);
+FunctionPass *llvm::createAMDGPUSimplifyLibCallsPass(const TargetMachine *TM) {
+ return new AMDGPUSimplifyLibCalls(TM);
}
FunctionPass *llvm::createAMDGPUUseNativeCallsPass() {
return new AMDGPUUseNativeCalls();
}
-static bool setFastFlags(Function &F, const TargetOptions &Options) {
- AttrBuilder B;
-
- if (Options.UnsafeFPMath || Options.NoInfsFPMath)
- B.addAttribute("no-infs-fp-math", "true");
- if (Options.UnsafeFPMath || Options.NoNaNsFPMath)
- B.addAttribute("no-nans-fp-math", "true");
- if (Options.UnsafeFPMath) {
- B.addAttribute("less-precise-fpmad", "true");
- B.addAttribute("unsafe-fp-math", "true");
- }
-
- if (!B.hasAttributes())
- return false;
-
- F.addAttributes(AttributeList::FunctionIndex, B);
-
- return true;
-}
-
bool AMDGPUSimplifyLibCalls::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
@@ -1750,15 +1725,14 @@ bool AMDGPUSimplifyLibCalls::runOnFunction(Function &F) {
LLVM_DEBUG(dbgs() << "AMDIC: process function ";
F.printAsOperand(dbgs(), false, F.getParent()); dbgs() << '\n';);
- if (!EnablePreLink)
- Changed |= setFastFlags(F, Options);
-
for (auto &BB : F) {
for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ) {
// Ignore non-calls.
CallInst *CI = dyn_cast<CallInst>(I);
++I;
- if (!CI) continue;
+ // Ignore intrinsics that do not become real instructions.
+ if (!CI || isa<DbgInfoIntrinsic>(CI) || CI->isLifetimeStartOrEnd())
+ continue;
// Ignore indirect calls.
Function *Callee = CI->getCalledFunction();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp
index e1ae496d9cbca..2b5143ba7506c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp
@@ -10,17 +10,18 @@
//
//===----------------------------------------------------------------------===//
-#include "AMDGPU.h"
#include "AMDGPULibFunc.h"
-#include <llvm/ADT/SmallString.h>
-#include <llvm/ADT/SmallVector.h>
-#include <llvm/ADT/StringSwitch.h>
+#include "AMDGPU.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringSwitch.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ValueSymbolTable.h"
-#include <llvm/Support/raw_ostream.h>
+#include "llvm/Support/raw_ostream.h"
#include <string>
using namespace llvm;
@@ -479,8 +480,6 @@ static bool eatTerm(StringRef& mangledName, const char (&str)[N]) {
return false;
}
-static inline bool isDigit(char c) { return c >= '0' && c <= '9'; }
-
static int eatNumber(StringRef& s) {
size_t const savedSize = s.size();
int n = 0;
@@ -605,7 +604,7 @@ bool ItaniumParamParser::parseItaniumParam(StringRef& param,
// parse type
char const TC = param.front();
- if (::isDigit(TC)) {
+ if (isDigit(TC)) {
res.ArgType = StringSwitch<AMDGPULibFunc::EType>
(eatLengthPrefixedName(param))
.Case("ocl_image1darray" , AMDGPULibFunc::IMG1DA)
@@ -863,7 +862,7 @@ std::string AMDGPUMangledLibFunc::mangleNameItanium() const {
Param P;
while ((P = I.getNextParam()).ArgType != 0)
Mangler(S, P);
- return S.str();
+ return std::string(S.str());
}
///////////////////////////////////////////////////////////////////////////////
@@ -903,7 +902,7 @@ static Type* getIntrinsicParamType(
return nullptr;
}
if (P.VectorSize > 1)
- T = VectorType::get(T, P.VectorSize);
+ T = FixedVectorType::get(T, P.VectorSize);
if (P.PtrKind != AMDGPULibFunc::BYVALUE)
T = useAddrSpace ? T->getPointerTo((P.PtrKind & AMDGPULibFunc::ADDR_SPACE)
- 1)
@@ -936,7 +935,7 @@ std::string AMDGPUMangledLibFunc::getName() const {
SmallString<128> Buf;
raw_svector_ostream OS(Buf);
writeName(OS);
- return OS.str();
+ return std::string(OS.str());
}
Function *AMDGPULibFunc::getFunction(Module *M, const AMDGPULibFunc &fInfo) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h
index 2354ed7df2059..c97223b047e88 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h
@@ -13,6 +13,7 @@
namespace llvm {
+class FunctionCallee;
class FunctionType;
class Function;
class Module;
@@ -341,7 +342,7 @@ public:
/// and unmangled function name for unmangled library functions.
virtual std::string mangle() const = 0;
- void setName(StringRef N) { Name = N; }
+ void setName(StringRef N) { Name = std::string(N); }
void setPrefix(ENamePrefix pfx) { FKind = pfx; }
virtual FunctionType *getFunctionType(Module &M) const = 0;
@@ -438,7 +439,7 @@ class AMDGPUUnmangledLibFunc : public AMDGPULibFuncImpl {
public:
explicit AMDGPUUnmangledLibFunc();
explicit AMDGPUUnmangledLibFunc(StringRef FName, FunctionType *FT) {
- Name = FName;
+ Name = std::string(FName);
FuncTy = FT;
}
std::string getName() const override { return Name; }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
index 15032969890e2..54c15e4e4d397 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
@@ -22,7 +22,15 @@ using namespace llvm;
namespace {
-const unsigned MaxStaticSize = 1024;
+static int MaxStaticSize;
+
+static cl::opt<int, true> MemIntrinsicExpandSizeThresholdOpt(
+ "amdgpu-mem-intrinsic-expand-size",
+ cl::desc("Set minimum mem intrinsic size to expand in IR"),
+ cl::location(MaxStaticSize),
+ cl::init(1024),
+ cl::Hidden);
+
class AMDGPULowerIntrinsics : public ModulePass {
private:
@@ -57,7 +65,7 @@ INITIALIZE_PASS(AMDGPULowerIntrinsics, DEBUG_TYPE, "Lower intrinsics", false,
// require splitting based on alignment)
static bool shouldExpandOperationWithSize(Value *Size) {
ConstantInt *CI = dyn_cast<ConstantInt>(Size);
- return !CI || (CI->getZExtValue() > MaxStaticSize);
+ return !CI || (CI->getSExtValue() > MaxStaticSize);
}
bool AMDGPULowerIntrinsics::expandMemIntrinsicUses(Function &F) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index e64542a395f0e..62ab5bb55a16a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -58,6 +58,21 @@ public:
} // end anonymous namespace
+// Skip leading static allocas so kernarg loads are inserted after them.
+static BasicBlock::iterator getInsertPt(BasicBlock &BB) {
+ BasicBlock::iterator InsPt = BB.getFirstInsertionPt();
+ for (BasicBlock::iterator E = BB.end(); InsPt != E; ++InsPt) {
+ AllocaInst *AI = dyn_cast<AllocaInst>(&*InsPt);
+
+ // If this is a dynamic alloca, the value may depend on the loaded kernargs,
+ // so loads will need to be inserted before it.
+ if (!AI || !AI->isStaticAlloca())
+ break;
+ }
+
+ return InsPt;
+}
+
bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
CallingConv::ID CC = F.getCallingConv();
if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty())
@@ -70,7 +85,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
LLVMContext &Ctx = F.getParent()->getContext();
const DataLayout &DL = F.getParent()->getDataLayout();
BasicBlock &EntryBlock = *F.begin();
- IRBuilder<> Builder(&*EntryBlock.begin());
+ IRBuilder<> Builder(&*getInsertPt(EntryBlock));
const Align KernArgBaseAlign(16); // FIXME: Increase if necessary
const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(F);
@@ -94,7 +109,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
for (Argument &Arg : F.args()) {
Type *ArgTy = Arg.getType();
- unsigned ABITypeAlign = DL.getABITypeAlignment(ArgTy);
+ Align ABITypeAlign = DL.getABITypeAlign(ArgTy);
unsigned Size = DL.getTypeSizeInBits(ArgTy);
unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
@@ -120,7 +135,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
continue;
}
- VectorType *VT = dyn_cast<VectorType>(ArgTy);
+ auto *VT = dyn_cast<FixedVectorType>(ArgTy);
bool IsV3 = VT && VT->getNumElements() == 3;
bool DoShiftOpt = Size < 32 && !ArgTy->isAggregateType();
@@ -152,7 +167,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
}
if (IsV3 && Size >= 32) {
- V4Ty = VectorType::get(VT->getVectorElementType(), 4);
+ V4Ty = FixedVectorType::get(VT->getElementType(), 4);
// Use the hack that clang uses to avoid SelectionDAG ruining v3 loads
AdjustedArgTy = V4Ty;
}
@@ -160,7 +175,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
ArgPtr = Builder.CreateBitCast(ArgPtr, AdjustedArgTy->getPointerTo(AS),
ArgPtr->getName() + ".cast");
LoadInst *Load =
- Builder.CreateAlignedLoad(AdjustedArgTy, ArgPtr, AdjustedAlign.value());
+ Builder.CreateAlignedLoad(AdjustedArgTy, ArgPtr, AdjustedAlign);
Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {}));
MDBuilder MDB(Ctx);
@@ -210,7 +225,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
Arg.replaceAllUsesWith(NewVal);
} else if (IsV3) {
Value *Shuf = Builder.CreateShuffleVector(Load, UndefValue::get(V4Ty),
- {0, 1, 2},
+ ArrayRef<int>{0, 1, 2},
Arg.getName() + ".load");
Arg.replaceAllUsesWith(Shuf);
} else {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index ce7286dabcc8a..99d229c9b74ee 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -254,7 +254,7 @@ const MCExpr *AMDGPUAsmPrinter::lowerConstant(const Constant *CV) {
return AsmPrinter::lowerConstant(CV);
}
-void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
if (emitPseudoExpansionLowering(*OutStreamer, MI))
return;
@@ -272,7 +272,7 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
const MachineBasicBlock *MBB = MI->getParent();
MachineBasicBlock::const_instr_iterator I = ++MI->getIterator();
while (I != MBB->instr_end() && I->isInsideBundle()) {
- EmitInstruction(&*I);
+ emitInstruction(&*I);
++I;
}
} else {
@@ -381,7 +381,7 @@ void R600MCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
}
}
-void R600AsmPrinter::EmitInstruction(const MachineInstr *MI) {
+void R600AsmPrinter::emitInstruction(const MachineInstr *MI) {
const R600Subtarget &STI = MF->getSubtarget<R600Subtarget>();
R600MCInstLower MCInstLowering(OutContext, STI, *this);
@@ -396,7 +396,7 @@ void R600AsmPrinter::EmitInstruction(const MachineInstr *MI) {
const MachineBasicBlock *MBB = MI->getParent();
MachineBasicBlock::const_instr_iterator I = ++MI->getIterator();
while (I != MBB->instr_end() && I->isInsideBundle()) {
- EmitInstruction(&*I);
+ emitInstruction(&*I);
++I;
}
} else {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index 940ddff85d73f..64acd6efe0280 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -15,14 +15,9 @@ using namespace llvm;
AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
MachineFunctionInfo(),
- LocalMemoryObjects(),
- ExplicitKernArgSize(0),
- LDSSize(0),
- Mode(MF.getFunction(), MF.getSubtarget<GCNSubtarget>()),
+ Mode(MF.getFunction()),
IsEntryFunction(AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())),
- NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath),
- MemoryBound(false),
- WaveLimiter(false) {
+ NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath) {
const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
// FIXME: Should initialize KernArgSize based on ExplicitKernelArgOffset,
@@ -43,19 +38,18 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
}
unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
- const GlobalValue &GV) {
+ const GlobalVariable &GV) {
auto Entry = LocalMemoryObjects.insert(std::make_pair(&GV, 0));
if (!Entry.second)
return Entry.first->second;
- unsigned Align = GV.getAlignment();
- if (Align == 0)
- Align = DL.getABITypeAlignment(GV.getValueType());
+ Align Alignment =
+ DL.getValueOrABITypeAlignment(GV.getAlign(), GV.getValueType());
/// TODO: We should sort these to minimize wasted space due to alignment
/// padding. Currently the padding is decided by the first encountered use
/// during lowering.
- unsigned Offset = LDSSize = alignTo(LDSSize, Align);
+ unsigned Offset = LDSSize = alignTo(LDSSize, Alignment);
Entry.first->second = Offset;
LDSSize += DL.getTypeAllocSize(GV.getValueType());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index 1933e41c66f36..c504dd76bc658 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -23,26 +23,26 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
SmallDenseMap<const GlobalValue *, unsigned, 4> LocalMemoryObjects;
protected:
- uint64_t ExplicitKernArgSize; // Cache for this.
+ uint64_t ExplicitKernArgSize = 0; // Cache for this.
Align MaxKernArgAlign; // Cache for this.
/// Number of bytes in the LDS that are being used.
- unsigned LDSSize;
+ unsigned LDSSize = 0;
// State of MODE register, assumed FP mode.
AMDGPU::SIModeRegisterDefaults Mode;
// Kernels + shaders. i.e. functions called by the driver and not called
// by other functions.
- bool IsEntryFunction;
+ bool IsEntryFunction = false;
- bool NoSignedZerosFPMath;
+ bool NoSignedZerosFPMath = false;
// Function may be memory bound.
- bool MemoryBound;
+ bool MemoryBound = false;
// Kernel may need limited waves per EU for better performance.
- bool WaveLimiter;
+ bool WaveLimiter = false;
public:
AMDGPUMachineFunction(const MachineFunction &MF);
@@ -77,7 +77,7 @@ public:
return WaveLimiter;
}
- unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalValue &GV);
+ unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV);
};
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp
index 8c11230f411a9..b05855d1afc64 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp
@@ -34,6 +34,7 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII_,
switch (SecondMI.getOpcode()) {
case AMDGPU::V_ADDC_U32_e64:
case AMDGPU::V_SUBB_U32_e64:
+ case AMDGPU::V_SUBBREV_U32_e64:
case AMDGPU::V_CNDMASK_B32_e64: {
// Try to cluster defs of condition registers to their uses. This improves
// the chance VCC will be available which will allow shrinking to VOP2
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
index f7231471c1077..4f9ffa11bc73b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
@@ -33,6 +33,7 @@
#include "AMDGPU.h"
#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
index 9613d5a843b38..93079738ef990 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
@@ -28,6 +28,7 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/ValueMap.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetMachine.h"
using namespace llvm;
@@ -220,9 +221,8 @@ AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
++FI.InstCount;
continue;
}
- CallSite CS(const_cast<Instruction *>(&I));
- if (CS) {
- Function *Callee = CS.getCalledFunction();
+ if (auto *CB = dyn_cast<CallBase>(&I)) {
+ Function *Callee = CB->getCalledFunction();
if (!Callee || Callee->isDeclaration()) {
++FI.InstCount;
continue;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
new file mode 100644
index 0000000000000..098b0e9938861
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -0,0 +1,359 @@
+//=== lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass does combining of machine instructions at the generic MI level,
+// after the legalizer.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUTargetMachine.h"
+#include "AMDGPULegalizerInfo.h"
+#include "llvm/CodeGen/GlobalISel/Combiner.h"
+#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
+#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
+#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/Support/Debug.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+
+#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"
+
+using namespace llvm;
+using namespace MIPatternMatch;
+
+struct FMinFMaxLegacyInfo {
+ Register LHS;
+ Register RHS;
+ Register True;
+ Register False;
+ CmpInst::Predicate Pred;
+};
+
+// TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
+static bool matchFMinFMaxLegacy(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineFunction &MF, FMinFMaxLegacyInfo &Info) {
+ // FIXME: Combines should have subtarget predicates, and we shouldn't need
+ // this here.
+ if (!MF.getSubtarget<GCNSubtarget>().hasFminFmaxLegacy())
+ return false;
+
+ // FIXME: Type predicate on pattern
+ if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
+ return false;
+
+ Register Cond = MI.getOperand(1).getReg();
+ if (!MRI.hasOneNonDBGUse(Cond) ||
+ !mi_match(Cond, MRI,
+ m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
+ return false;
+
+ Info.True = MI.getOperand(2).getReg();
+ Info.False = MI.getOperand(3).getReg();
+
+ if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
+ !(Info.LHS == Info.False && Info.RHS == Info.True))
+ return false;
+
+ switch (Info.Pred) {
+ case CmpInst::FCMP_FALSE:
+ case CmpInst::FCMP_OEQ:
+ case CmpInst::FCMP_ONE:
+ case CmpInst::FCMP_ORD:
+ case CmpInst::FCMP_UNO:
+ case CmpInst::FCMP_UEQ:
+ case CmpInst::FCMP_UNE:
+ case CmpInst::FCMP_TRUE:
+ return false;
+ default:
+ return true;
+ }
+}
+
+static void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
+ const FMinFMaxLegacyInfo &Info) {
+
+ auto buildNewInst = [&MI](unsigned Opc, Register X, Register Y) {
+ MachineIRBuilder MIB(MI);
+ MIB.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
+ };
+
+ switch (Info.Pred) {
+ case CmpInst::FCMP_ULT:
+ case CmpInst::FCMP_ULE:
+ if (Info.LHS == Info.True)
+ buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
+ else
+ buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
+ break;
+ case CmpInst::FCMP_OLE:
+ case CmpInst::FCMP_OLT: {
+ // We need to permute the operands to get the correct NaN behavior. The
+ // selected operand is the second one based on the failing compare with NaN,
+ // so permute it based on the compare type the hardware uses.
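+ //
+ // E.g. 'select (fcmp olt x, y), x, y' becomes fmin_legacy(x, y), while
+ // 'select (fcmp olt x, y), y, x' becomes fmax_legacy(y, x).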
+ if (Info.LHS == Info.True)
+ buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
+ else
+ buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
+ break;
+ }
+ case CmpInst::FCMP_UGE:
+ case CmpInst::FCMP_UGT: {
+ if (Info.LHS == Info.True)
+ buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
+ else
+ buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
+ break;
+ }
+ case CmpInst::FCMP_OGT:
+ case CmpInst::FCMP_OGE: {
+ if (Info.LHS == Info.True)
+ buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
+ else
+ buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
+ break;
+ }
+ default:
+ llvm_unreachable("predicate should not have matched");
+ }
+
+ MI.eraseFromParent();
+}
+
+static bool matchUCharToFloat(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineFunction &MF, CombinerHelper &Helper) {
+ Register DstReg = MI.getOperand(0).getReg();
+
+ // TODO: We could try to match extracting the higher bytes, which would be
+ // easier if i8 vectors weren't promoted to i32 vectors, particularly after
+ // types are legalized. v4i8 -> v4f32 is probably the only case to worry
+ // about in practice.
+ LLT Ty = MRI.getType(DstReg);
+ if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
+ Register SrcReg = MI.getOperand(1).getReg();
+ unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
+ assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
+ const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8);
+ return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask);
+ }
+
+ return false;
+}
+
+static void applyUCharToFloat(MachineInstr &MI) {
+ MachineIRBuilder B(MI);
+
+ const LLT S32 = LLT::scalar(32);
+
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+ LLT Ty = B.getMRI()->getType(DstReg);
+ LLT SrcTy = B.getMRI()->getType(SrcReg);
+ if (SrcTy != S32)
+ SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);
+
+ if (Ty == S32) {
+ B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg},
+ {SrcReg}, MI.getFlags());
+ } else {
+ auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32},
+ {SrcReg}, MI.getFlags());
+ B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
+ }
+
+ MI.eraseFromParent();
+}
+
+// FIXME: Should be able to have 2 separate matchdatas rather than custom struct
+// boilerplate.
+struct CvtF32UByteMatchInfo {
+ Register CvtVal;
+ unsigned ShiftOffset;
+};
+
+static bool matchCvtF32UByteN(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineFunction &MF,
+ CvtF32UByteMatchInfo &MatchInfo) {
+ Register SrcReg = MI.getOperand(1).getReg();
+
+ // Look through G_ZEXT.
+ mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));
+
+ Register Src0;
+ int64_t ShiftAmt;
+ bool IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt)));
+ if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) {
+ const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;
+
+ unsigned ShiftOffset = 8 * Offset;
+ if (IsShr)
+ ShiftOffset += ShiftAmt;
+ else
+ ShiftOffset -= ShiftAmt;
+
+ MatchInfo.CvtVal = Src0;
+ MatchInfo.ShiftOffset = ShiftOffset;
+ return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
+ }
+
+ // TODO: Simplify demanded bits.
+ return false;
+}
+
+static void applyCvtF32UByteN(MachineInstr &MI,
+ const CvtF32UByteMatchInfo &MatchInfo) {
+ MachineIRBuilder B(MI);
+ unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;
+
+ const LLT S32 = LLT::scalar(32);
+ Register CvtSrc = MatchInfo.CvtVal;
+ LLT SrcTy = B.getMRI()->getType(MatchInfo.CvtVal);
+ if (SrcTy != S32) {
+ assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
+ CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
+ }
+
+ assert(MI.getOpcode() != NewOpc);
+ B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());
+ MI.eraseFromParent();
+}
+
+#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
+#include "AMDGPUGenPostLegalizeGICombiner.inc"
+#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
+
+namespace {
+#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
+#include "AMDGPUGenPostLegalizeGICombiner.inc"
+#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
+
+class AMDGPUPostLegalizerCombinerInfo : public CombinerInfo {
+ GISelKnownBits *KB;
+ MachineDominatorTree *MDT;
+
+public:
+ AMDGPUGenPostLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;
+
+ AMDGPUPostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
+ const AMDGPULegalizerInfo *LI,
+ GISelKnownBits *KB, MachineDominatorTree *MDT)
+ : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
+ /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
+ KB(KB), MDT(MDT) {
+ if (!GeneratedRuleCfg.parseCommandLineOption())
+ report_fatal_error("Invalid rule identifier");
+ }
+
+ bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
+ MachineIRBuilder &B) const override;
+};
+
+bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
+ MachineInstr &MI,
+ MachineIRBuilder &B) const {
+ CombinerHelper Helper(Observer, B, KB, MDT);
+ AMDGPUGenPostLegalizerCombinerHelper Generated(GeneratedRuleCfg);
+
+ if (Generated.tryCombineAll(Observer, MI, B, Helper))
+ return true;
+
+ switch (MI.getOpcode()) {
+ case TargetOpcode::G_SHL:
+ case TargetOpcode::G_LSHR:
+ case TargetOpcode::G_ASHR:
+ // On some subtargets, 64-bit shift is a quarter rate instruction. In the
+ // common case, splitting this into a move and a 32-bit shift is faster and
+ // the same code size.
+ return Helper.tryCombineShiftToUnmerge(MI, 32);
+ }
+
+ return false;
+}
+
+#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
+#include "AMDGPUGenPostLegalizeGICombiner.inc"
+#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
+
+// Pass boilerplate
+// ================
+
+class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
+public:
+ static char ID;
+
+ AMDGPUPostLegalizerCombiner(bool IsOptNone = false);
+
+ StringRef getPassName() const override {
+ return "AMDGPUPostLegalizerCombiner";
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+private:
+ bool IsOptNone;
+};
+} // end anonymous namespace
+
+void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetPassConfig>();
+ AU.setPreservesCFG();
+ getSelectionDAGFallbackAnalysisUsage(AU);
+ AU.addRequired<GISelKnownBitsAnalysis>();
+ AU.addPreserved<GISelKnownBitsAnalysis>();
+ if (!IsOptNone) {
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ }
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
+ : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
+ initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
+}
+
+bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
+ if (MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::FailedISel))
+ return false;
+ auto *TPC = &getAnalysis<TargetPassConfig>();
+ const Function &F = MF.getFunction();
+ bool EnableOpt =
+ MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);
+
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const AMDGPULegalizerInfo *LI
+ = static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());
+
+ GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
+ MachineDominatorTree *MDT =
+ IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
+ AMDGPUPostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
+ F.hasMinSize(), LI, KB, MDT);
+ Combiner C(PCInfo, TPC);
+ return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
+}
+
+char AMDGPUPostLegalizerCombiner::ID = 0;
+INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
+ "Combine AMDGPU machine instrs after legalization",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
+INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
+ "Combine AMDGPU machine instrs after legalization", false,
+ false)
+
+namespace llvm {
+FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
+ return new AMDGPUPostLegalizerCombiner(IsOptNone);
+}
+} // end namespace llvm
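
A minimal standalone sketch (not LLVM code; the helper name and signature are invented for illustration) of the byte-offset bookkeeping performed by matchCvtF32UByteN above: a G_AMDGPU_CVT_F32_UBYTEn conversion fed by a constant shift can be refolded into a different byte index as long as the resulting bit offset stays byte-aligned and within the low 32 bits.

#include <cstdint>
#include <optional>

// Model of the shift-folding check: CvtByte is the n in CVT_F32_UBYTEn,
// IsShiftRight says whether the source was a logical shift right (vs. a shift
// left), and ShiftAmt is the constant shift amount in bits.
std::optional<unsigned> foldShiftIntoCvtByte(unsigned CvtByte, bool IsShiftRight,
                                             int64_t ShiftAmt) {
  int64_t BitOffset = 8 * int64_t(CvtByte);
  BitOffset += IsShiftRight ? ShiftAmt : -ShiftAmt;
  // Mirrors the 8 <= offset < 32, multiple-of-8 validity test in the matcher.
  if (BitOffset >= 8 && BitOffset < 32 && (BitOffset % 8) == 0)
    return unsigned(BitOffset / 8);
  return std::nullopt;
}

// For example, cvt_f32_ubyte0 (lshr x, 16) folds to cvt_f32_ubyte2 x:
// foldShiftIntoCvtByte(0, /*IsShiftRight=*/true, 16) yields 2.
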
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
new file mode 100644
index 0000000000000..800ad2039f0e9
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
@@ -0,0 +1,153 @@
+//=== lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass does combining of machine instructions at the generic MI level,
+// before the legalizer.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUTargetMachine.h"
+#include "llvm/CodeGen/GlobalISel/Combiner.h"
+#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
+#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
+#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/Support/Debug.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+
+#define DEBUG_TYPE "amdgpu-prelegalizer-combiner"
+
+using namespace llvm;
+using namespace MIPatternMatch;
+
+#define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
+#include "AMDGPUGenPreLegalizeGICombiner.inc"
+#undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
+
+namespace {
+#define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
+#include "AMDGPUGenPreLegalizeGICombiner.inc"
+#undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
+
+class AMDGPUPreLegalizerCombinerInfo : public CombinerInfo {
+ GISelKnownBits *KB;
+ MachineDominatorTree *MDT;
+
+public:
+ AMDGPUGenPreLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;
+
+ AMDGPUPreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
+ GISelKnownBits *KB, MachineDominatorTree *MDT)
+ : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
+ /*LegalizerInfo*/ nullptr, EnableOpt, OptSize, MinSize),
+ KB(KB), MDT(MDT) {
+ if (!GeneratedRuleCfg.parseCommandLineOption())
+ report_fatal_error("Invalid rule identifier");
+ }
+
+ virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
+ MachineIRBuilder &B) const override;
+};
+
+bool AMDGPUPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
+ MachineInstr &MI,
+ MachineIRBuilder &B) const {
+ CombinerHelper Helper(Observer, B, KB, MDT);
+ AMDGPUGenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg);
+
+ if (Generated.tryCombineAll(Observer, MI, B, Helper))
+ return true;
+
+ switch (MI.getOpcode()) {
+ case TargetOpcode::G_CONCAT_VECTORS:
+ return Helper.tryCombineConcatVectors(MI);
+ case TargetOpcode::G_SHUFFLE_VECTOR:
+ return Helper.tryCombineShuffleVector(MI);
+ }
+
+ return false;
+}
+
+#define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
+#include "AMDGPUGenPreLegalizeGICombiner.inc"
+#undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
+
+// Pass boilerplate
+// ================
+
+class AMDGPUPreLegalizerCombiner : public MachineFunctionPass {
+public:
+ static char ID;
+
+ AMDGPUPreLegalizerCombiner(bool IsOptNone = false);
+
+ StringRef getPassName() const override {
+ return "AMDGPUPreLegalizerCombiner";
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+private:
+ bool IsOptNone;
+};
+} // end anonymous namespace
+
+void AMDGPUPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetPassConfig>();
+ AU.setPreservesCFG();
+ getSelectionDAGFallbackAnalysisUsage(AU);
+ AU.addRequired<GISelKnownBitsAnalysis>();
+ AU.addPreserved<GISelKnownBitsAnalysis>();
+ if (!IsOptNone) {
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ }
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+AMDGPUPreLegalizerCombiner::AMDGPUPreLegalizerCombiner(bool IsOptNone)
+ : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
+ initializeAMDGPUPreLegalizerCombinerPass(*PassRegistry::getPassRegistry());
+}
+
+bool AMDGPUPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
+ if (MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::FailedISel))
+ return false;
+ auto *TPC = &getAnalysis<TargetPassConfig>();
+ const Function &F = MF.getFunction();
+ bool EnableOpt =
+ MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);
+ GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
+ MachineDominatorTree *MDT =
+ IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
+ AMDGPUPreLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
+ F.hasMinSize(), KB, MDT);
+ Combiner C(PCInfo, TPC);
+ return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
+}
+
+char AMDGPUPreLegalizerCombiner::ID = 0;
+INITIALIZE_PASS_BEGIN(AMDGPUPreLegalizerCombiner, DEBUG_TYPE,
+ "Combine AMDGPU machine instrs before legalization",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
+INITIALIZE_PASS_END(AMDGPUPreLegalizerCombiner, DEBUG_TYPE,
+ "Combine AMDGPU machine instrs before legalization", false,
+ false)
+
+namespace llvm {
+FunctionPass *createAMDGPUPreLegalizeCombiner(bool IsOptNone) {
+ return new AMDGPUPreLegalizerCombiner(IsOptNone);
+}
+} // end namespace llvm
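
The pre- and post-legalizer combiners above share the same dispatch shape: the tablegen-generated rules are tried first, and only if none of them fires does control fall through to a small hand-written opcode switch. A toy model of that ordering, using purely hypothetical names and no LLVM types, is sketched below.

#include <functional>
#include <vector>

struct ToyInstr {
  unsigned Opcode = 0;
};

using ToyCombine = std::function<bool(ToyInstr &)>;

// Returns true as soon as any rule rewrites the instruction; generated rules
// take priority over the manual fallbacks (e.g. the concat/shuffle handling
// in the pre-legalizer combiner).
bool tryCombine(ToyInstr &MI, const std::vector<ToyCombine> &GeneratedRules,
                const std::vector<ToyCombine> &ManualFallbacks) {
  for (const ToyCombine &Rule : GeneratedRules)
    if (Rule(MI))
      return true;
  for (const ToyCombine &Fallback : ManualFallbacks)
    if (Fallback(MI))
      return true;
  return false; // nothing matched; the instruction is left untouched
}
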
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
index 511de96b5f7cb..524a34be876ff 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
@@ -218,10 +218,10 @@ bool AMDGPUPrintfRuntimeBinding::lowerPrintfForGpu(
//
if (ArgSize % DWORD_ALIGN != 0) {
llvm::Type *ResType = llvm::Type::getInt32Ty(Ctx);
- VectorType *LLVMVecType = llvm::dyn_cast<llvm::VectorType>(ArgType);
+ auto *LLVMVecType = llvm::dyn_cast<llvm::FixedVectorType>(ArgType);
int NumElem = LLVMVecType ? LLVMVecType->getNumElements() : 1;
if (LLVMVecType && NumElem > 1)
- ResType = llvm::VectorType::get(ResType, NumElem);
+ ResType = llvm::FixedVectorType::get(ResType, NumElem);
Builder.SetInsertPoint(CI);
Builder.SetCurrentDebugLocation(CI->getDebugLoc());
if (OpConvSpecifiers[ArgCount - 1] == 'x' ||
@@ -387,9 +387,7 @@ bool AMDGPUPrintfRuntimeBinding::lowerPrintfForGpu(
Value *id_gep_cast =
new BitCastInst(BufferIdx, idPointer, "PrintBuffIdCast", Brnch);
- StoreInst *stbuff =
- new StoreInst(ConstantInt::get(I32Ty, UniqID), id_gep_cast);
- stbuff->insertBefore(Brnch); // to Remove unused variable warning
+ new StoreInst(ConstantInt::get(I32Ty, UniqID), id_gep_cast, Brnch);
SmallVector<Value *, 2> FourthIdxList;
ConstantInt *fourInt =
@@ -408,8 +406,7 @@ bool AMDGPUPrintfRuntimeBinding::lowerPrintfForGpu(
Value *Arg = CI->getArgOperand(ArgCount);
Type *ArgType = Arg->getType();
SmallVector<Value *, 32> WhatToStore;
- if (ArgType->isFPOrFPVectorTy() &&
- (ArgType->getTypeID() != Type::VectorTyID)) {
+ if (ArgType->isFPOrFPVectorTy() && !isa<VectorType>(ArgType)) {
Type *IType = (ArgType->isFloatTy()) ? Int32Ty : Int64Ty;
if (OpConvSpecifiers[ArgCount - 1] == 'f') {
ConstantFP *fpCons = dyn_cast<ConstantFP>(Arg);
@@ -478,18 +475,14 @@ bool AMDGPUPrintfRuntimeBinding::lowerPrintfForGpu(
Arg = new PtrToIntInst(Arg, DstType, "PrintArgPtr", Brnch);
WhatToStore.push_back(Arg);
}
- } else if (ArgType->getTypeID() == Type::VectorTyID) {
+ } else if (isa<FixedVectorType>(ArgType)) {
Type *IType = NULL;
- uint32_t EleCount = cast<VectorType>(ArgType)->getNumElements();
+ uint32_t EleCount = cast<FixedVectorType>(ArgType)->getNumElements();
uint32_t EleSize = ArgType->getScalarSizeInBits();
uint32_t TotalSize = EleCount * EleSize;
if (EleCount == 3) {
- IntegerType *Int32Ty = Type::getInt32Ty(ArgType->getContext());
- Constant *Indices[4] = {
- ConstantInt::get(Int32Ty, 0), ConstantInt::get(Int32Ty, 1),
- ConstantInt::get(Int32Ty, 2), ConstantInt::get(Int32Ty, 2)};
- Constant *Mask = ConstantVector::get(Indices);
- ShuffleVectorInst *Shuffle = new ShuffleVectorInst(Arg, Arg, Mask);
+ ShuffleVectorInst *Shuffle =
+ new ShuffleVectorInst(Arg, Arg, ArrayRef<int>{0, 1, 2, 2});
Shuffle->insertBefore(Brnch);
Arg = Shuffle;
ArgType = Arg->getType();
@@ -523,7 +516,7 @@ bool AMDGPUPrintfRuntimeBinding::lowerPrintfForGpu(
break;
}
if (EleCount > 1) {
- IType = dyn_cast<Type>(VectorType::get(IType, EleCount));
+ IType = FixedVectorType::get(IType, EleCount);
}
Arg = new BitCastInst(Arg, IType, "PrintArgVect", Brnch);
WhatToStore.push_back(Arg);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 14958a180ce3f..727f71b350490 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -76,6 +76,11 @@ static cl::opt<bool> DisablePromoteAllocaToLDS(
cl::desc("Disable promote alloca to LDS"),
cl::init(false));
+static cl::opt<unsigned> PromoteAllocaToVectorLimit(
+ "amdgpu-promote-alloca-to-vector-limit",
+ cl::desc("Maximum byte size to consider promote alloca to vector"),
+ cl::init(0));
+
// FIXME: This can create globals so should be a module pass.
class AMDGPUPromoteAlloca : public FunctionPass {
private:
@@ -86,6 +91,7 @@ private:
// FIXME: This should be per-kernel.
uint32_t LocalMemLimit = 0;
uint32_t CurrentLocalMemUsage = 0;
+ unsigned MaxVGPRs;
bool IsAMDGCN = false;
bool IsAMDHSA = false;
@@ -128,14 +134,42 @@ public:
}
};
+class AMDGPUPromoteAllocaToVector : public FunctionPass {
+private:
+ unsigned MaxVGPRs;
+
+public:
+ static char ID;
+
+ AMDGPUPromoteAllocaToVector() : FunctionPass(ID) {}
+
+ bool runOnFunction(Function &F) override;
+
+ StringRef getPassName() const override {
+ return "AMDGPU Promote Alloca to vector";
+ }
+
+ bool handleAlloca(AllocaInst &I);
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ FunctionPass::getAnalysisUsage(AU);
+ }
+};
+
} // end anonymous namespace
char AMDGPUPromoteAlloca::ID = 0;
+char AMDGPUPromoteAllocaToVector::ID = 0;
INITIALIZE_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
"AMDGPU promote alloca to vector or LDS", false, false)
+INITIALIZE_PASS(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
+ "AMDGPU promote alloca to vector", false, false)
+
char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
+char &llvm::AMDGPUPromoteAllocaToVectorID = AMDGPUPromoteAllocaToVector::ID;
bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
Mod = &M;
@@ -161,6 +195,13 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
if (!ST.isPromoteAllocaEnabled())
return false;
+ if (IsAMDGCN) {
+ const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
+ MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
+ } else {
+ MaxVGPRs = 128;
+ }
+
bool SufficientLDS = hasSufficientLocalMem(F);
bool Changed = false;
BasicBlock &EntryBB = *F.begin();
@@ -251,10 +292,10 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
// 32-bit and extract sequence is already present, and it is probably easier
// to CSE this. The loads should be mergable later anyway.
Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 1);
- LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, 4);
+ LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, Align(4));
Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 2);
- LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, 4);
+ LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, Align(4));
MDNode *MD = MDNode::get(Mod->getContext(), None);
LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
@@ -297,15 +338,26 @@ Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
return CI;
}
-static VectorType *arrayTypeToVecType(ArrayType *ArrayTy) {
- return VectorType::get(ArrayTy->getElementType(),
- ArrayTy->getNumElements());
+static FixedVectorType *arrayTypeToVecType(ArrayType *ArrayTy) {
+ return FixedVectorType::get(ArrayTy->getElementType(),
+ ArrayTy->getNumElements());
+}
+
+static Value *stripBitcasts(Value *V) {
+ while (Instruction *I = dyn_cast<Instruction>(V)) {
+ if (I->getOpcode() != Instruction::BitCast)
+ break;
+ V = I->getOperand(0);
+ }
+ return V;
}
static Value *
calculateVectorIndex(Value *Ptr,
const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
- GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);
+ GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(stripBitcasts(Ptr));
+ if (!GEP)
+ return nullptr;
auto I = GEPIdx.find(GEP);
return I == GEPIdx.end() ? nullptr : I->second;
@@ -327,7 +379,8 @@ static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
//
// TODO: Check isTriviallyVectorizable for calls and handle other
// instructions.
-static bool canVectorizeInst(Instruction *Inst, User *User) {
+static bool canVectorizeInst(Instruction *Inst, User *User,
+ const DataLayout &DL) {
switch (Inst->getOpcode()) {
case Instruction::Load: {
// Currently only handle the case where the Pointer Operand is a GEP.
@@ -337,7 +390,14 @@ static bool canVectorizeInst(Instruction *Inst, User *User) {
LI->getPointerOperandType() == User->getType() &&
isa<VectorType>(LI->getType()))
return true;
- return isa<GetElementPtrInst>(LI->getPointerOperand()) && LI->isSimple();
+
+ Instruction *PtrInst = dyn_cast<Instruction>(LI->getPointerOperand());
+ if (!PtrInst)
+ return false;
+
+ return (PtrInst->getOpcode() == Instruction::GetElementPtr ||
+ PtrInst->getOpcode() == Instruction::BitCast) &&
+ LI->isSimple();
}
case Instruction::BitCast:
return true;
@@ -350,22 +410,46 @@ static bool canVectorizeInst(Instruction *Inst, User *User) {
SI->getPointerOperandType() == User->getType() &&
isa<VectorType>(SI->getValueOperand()->getType()))
return true;
- return (SI->getPointerOperand() == User) && isa<GetElementPtrInst>(User) && SI->isSimple();
+
+ Instruction *UserInst = dyn_cast<Instruction>(User);
+ if (!UserInst)
+ return false;
+
+ return (SI->getPointerOperand() == User) &&
+ (UserInst->getOpcode() == Instruction::GetElementPtr ||
+ UserInst->getOpcode() == Instruction::BitCast) &&
+ SI->isSimple();
}
default:
return false;
}
}
-static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
+static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
+ unsigned MaxVGPRs) {
if (DisablePromoteAllocaToVector) {
LLVM_DEBUG(dbgs() << " Promotion alloca to vector is disabled\n");
return false;
}
- Type *AT = Alloca->getAllocatedType();
- SequentialType *AllocaTy = dyn_cast<SequentialType>(AT);
+ Type *AllocaTy = Alloca->getAllocatedType();
+ auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
+ if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
+ if (VectorType::isValidElementType(ArrayTy->getElementType()) &&
+ ArrayTy->getNumElements() > 0)
+ VectorTy = arrayTypeToVecType(ArrayTy);
+ }
+
+ // Use up to 1/4 of available register budget for vectorization.
+ unsigned Limit = PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
+ : (MaxVGPRs * 32);
+
+ if (DL.getTypeSizeInBits(AllocaTy) * 4 > Limit) {
+ LLVM_DEBUG(dbgs() << " Alloca too big for vectorization with "
+ << MaxVGPRs << " registers available\n");
+ return false;
+ }
LLVM_DEBUG(dbgs() << "Alloca candidate for vectorization\n");
@@ -373,22 +457,44 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
// are just being conservative for now.
// FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or equivalent. Potentially these
// could also be promoted but we don't currently handle this case
- if (!AllocaTy ||
- AllocaTy->getNumElements() > 16 ||
- AllocaTy->getNumElements() < 2 ||
- !VectorType::isValidElementType(AllocaTy->getElementType())) {
+ if (!VectorTy || VectorTy->getNumElements() > 16 ||
+ VectorTy->getNumElements() < 2) {
LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n");
return false;
}
std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
- std::vector<Value*> WorkList;
- for (User *AllocaUser : Alloca->users()) {
+ std::vector<Value *> WorkList;
+ SmallVector<User *, 8> Users(Alloca->users());
+ SmallVector<User *, 8> UseUsers(Users.size(), Alloca);
+ Type *VecEltTy = VectorTy->getElementType();
+ while (!Users.empty()) {
+ User *AllocaUser = Users.pop_back_val();
+ User *UseUser = UseUsers.pop_back_val();
+ Instruction *Inst = dyn_cast<Instruction>(AllocaUser);
+
GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
if (!GEP) {
- if (!canVectorizeInst(cast<Instruction>(AllocaUser), Alloca))
+ if (!canVectorizeInst(Inst, UseUser, DL))
return false;
+ if (Inst->getOpcode() == Instruction::BitCast) {
+ Type *FromTy = Inst->getOperand(0)->getType()->getPointerElementType();
+ Type *ToTy = Inst->getType()->getPointerElementType();
+ if (FromTy->isAggregateType() || ToTy->isAggregateType() ||
+ DL.getTypeSizeInBits(FromTy) != DL.getTypeSizeInBits(ToTy))
+ continue;
+
+ for (User *CastUser : Inst->users()) {
+ if (isAssumeLikeIntrinsic(cast<Instruction>(CastUser)))
+ continue;
+ Users.push_back(CastUser);
+ UseUsers.push_back(Inst);
+ }
+
+ continue;
+ }
+
WorkList.push_back(AllocaUser);
continue;
}
@@ -404,18 +510,10 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
}
GEPVectorIdx[GEP] = Index;
- for (User *GEPUser : AllocaUser->users()) {
- if (!canVectorizeInst(cast<Instruction>(GEPUser), AllocaUser))
- return false;
-
- WorkList.push_back(GEPUser);
- }
+ Users.append(GEP->user_begin(), GEP->user_end());
+ UseUsers.append(GEP->getNumUses(), GEP);
}
- VectorType *VectorTy = dyn_cast<VectorType>(AllocaTy);
- if (!VectorTy)
- VectorTy = arrayTypeToVecType(cast<ArrayType>(AllocaTy));
-
LLVM_DEBUG(dbgs() << " Converting alloca to vector " << *AllocaTy << " -> "
<< *VectorTy << '\n');
@@ -424,40 +522,46 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
IRBuilder<> Builder(Inst);
switch (Inst->getOpcode()) {
case Instruction::Load: {
- if (Inst->getType() == AT)
+ if (Inst->getType() == AllocaTy || Inst->getType()->isVectorTy())
break;
- Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
Value *Ptr = cast<LoadInst>(Inst)->getPointerOperand();
Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
+ if (!Index)
+ break;
+ Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
+ if (Inst->getType() != VecEltTy)
+ ExtractElement = Builder.CreateBitOrPointerCast(ExtractElement, Inst->getType());
Inst->replaceAllUsesWith(ExtractElement);
Inst->eraseFromParent();
break;
}
case Instruction::Store: {
StoreInst *SI = cast<StoreInst>(Inst);
- if (SI->getValueOperand()->getType() == AT)
+ if (SI->getValueOperand()->getType() == AllocaTy ||
+ SI->getValueOperand()->getType()->isVectorTy())
break;
- Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
Value *Ptr = SI->getPointerOperand();
Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
+ if (!Index)
+ break;
+
+ Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
- Value *NewVecValue = Builder.CreateInsertElement(VecValue,
- SI->getValueOperand(),
- Index);
+ Value *Elt = SI->getValueOperand();
+ if (Elt->getType() != VecEltTy)
+ Elt = Builder.CreateBitOrPointerCast(Elt, VecEltTy);
+ Value *NewVecValue = Builder.CreateInsertElement(VecValue, Elt, Index);
Builder.CreateStore(NewVecValue, BitCast);
Inst->eraseFromParent();
break;
}
- case Instruction::BitCast:
- case Instruction::AddrSpaceCast:
- break;
default:
llvm_unreachable("Inconsistency in instructions promotable to vector");
@@ -659,16 +763,15 @@ bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) {
continue;
if (Use->getParent()->getParent() == &F) {
- unsigned Align = GV.getAlignment();
- if (Align == 0)
- Align = DL.getABITypeAlignment(GV.getValueType());
+ Align Alignment =
+ DL.getValueOrABITypeAlignment(GV.getAlign(), GV.getValueType());
// FIXME: Try to account for padding here. The padding is currently
// determined from the inverse order of uses in the function. I'm not
// sure if the use list order is in any way connected to this, so the
// total reported size is likely incorrect.
uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
- CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align);
+ CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alignment);
CurrentLocalMemUsage += AllocSize;
break;
}
@@ -722,6 +825,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
if (!I.isStaticAlloca() || I.isArrayAllocation())
return false;
+ const DataLayout &DL = Mod->getDataLayout();
IRBuilder<> Builder(&I);
// First try to replace the alloca with a vector
@@ -729,7 +833,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');
- if (tryPromoteAllocaToVector(&I))
+ if (tryPromoteAllocaToVector(&I, DL, MaxVGPRs))
return true; // Promoted to vector.
if (DisablePromoteAllocaToLDS)
@@ -759,11 +863,8 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, ContainingFunction);
unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
- const DataLayout &DL = Mod->getDataLayout();
-
- unsigned Align = I.getAlignment();
- if (Align == 0)
- Align = DL.getABITypeAlignment(I.getAllocatedType());
+ Align Alignment =
+ DL.getValueOrABITypeAlignment(I.getAlign(), I.getAllocatedType());
// FIXME: This computed padding is likely wrong since it depends on inverse
// usage order.
@@ -771,7 +872,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
// FIXME: It is also possible that if we're allowed to use all of the memory
// could could end up using more than the maximum due to alignment padding.
- uint32_t NewSize = alignTo(CurrentLocalMemUsage, Align);
+ uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment);
uint32_t AllocSize = WorkGroupSize * DL.getTypeAllocSize(AllocaTy);
NewSize += AllocSize;
@@ -938,6 +1039,60 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
return true;
}
+bool AMDGPUPromoteAllocaToVector::runOnFunction(Function &F) {
+ if (skipFunction(F) || DisablePromoteAllocaToVector)
+ return false;
+
+ const TargetMachine *TM;
+ if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
+ TM = &TPC->getTM<TargetMachine>();
+ else
+ return false;
+
+ const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
+ if (!ST.isPromoteAllocaEnabled())
+ return false;
+
+ if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
+ const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
+ MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
+ } else {
+ MaxVGPRs = 128;
+ }
+
+ bool Changed = false;
+ BasicBlock &EntryBB = *F.begin();
+
+ SmallVector<AllocaInst *, 16> Allocas;
+ for (Instruction &I : EntryBB) {
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
+ Allocas.push_back(AI);
+ }
+
+ for (AllocaInst *AI : Allocas) {
+ if (handleAlloca(*AI))
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+bool AMDGPUPromoteAllocaToVector::handleAlloca(AllocaInst &I) {
+ // Array allocations are probably not worth handling, since an allocation of
+ // the array type is the canonical form.
+ if (!I.isStaticAlloca() || I.isArrayAllocation())
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');
+
+ Module *Mod = I.getParent()->getParent()->getParent();
+ return tryPromoteAllocaToVector(&I, Mod->getDataLayout(), MaxVGPRs);
+}
+
FunctionPass *llvm::createAMDGPUPromoteAlloca() {
return new AMDGPUPromoteAlloca();
}
+
+FunctionPass *llvm::createAMDGPUPromoteAllocaToVector() {
+ return new AMDGPUPromoteAllocaToVector();
+}
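
The register-budget test added to tryPromoteAllocaToVector above is easy to restate outside of LLVM. The sketch below (hypothetical function name; sizes in bits except for the override, which is in bytes like the amdgpu-promote-alloca-to-vector-limit option) models the "use at most a quarter of the VGPR budget" rule; note that the quarter factor applies whether or not an explicit byte limit is given.

#include <cstdint>

bool fitsVectorPromotionBudget(uint64_t AllocaSizeInBits,
                               unsigned MaxVGPRs,           // 32-bit registers
                               unsigned ByteLimitOverride) { // 0 = no override
  // Budget in bits: either the explicit override (given in bytes) or the full
  // VGPR allocation for this function.
  uint64_t LimitInBits = ByteLimitOverride
                             ? uint64_t(ByteLimitOverride) * 8
                             : uint64_t(MaxVGPRs) * 32;
  // Reject allocas whose promoted vector would take more than 1/4 of it.
  return AllocaSizeInBits * 4 <= LimitInBits;
}

// With MaxVGPRs = 256 the budget is 8192 bits, so e.g. a [16 x i32] alloca
// (512 bits) passes this check, while anything above 2048 bits is rejected.
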
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
index 7a7addd0f5cfe..982aae3748849 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
@@ -48,19 +48,62 @@ extern const SubtargetFeatureKV AMDGPUFeatureKV[AMDGPU::NumSubtargetFeatures-1];
namespace {
+// Target features to propagate.
+static constexpr const FeatureBitset TargetFeatures = {
+ AMDGPU::FeatureWavefrontSize16,
+ AMDGPU::FeatureWavefrontSize32,
+ AMDGPU::FeatureWavefrontSize64
+};
+
+// Attributes to propagate.
+static constexpr const char* AttributeNames[] = {
+ "amdgpu-waves-per-eu"
+};
+
+static constexpr unsigned NumAttr =
+ sizeof(AttributeNames) / sizeof(AttributeNames[0]);
+
class AMDGPUPropagateAttributes {
- const FeatureBitset TargetFeatures = {
- AMDGPU::FeatureWavefrontSize16,
- AMDGPU::FeatureWavefrontSize32,
- AMDGPU::FeatureWavefrontSize64
+
+ class FnProperties {
+ private:
+ explicit FnProperties(const FeatureBitset &&FB) : Features(FB) {}
+
+ public:
+ explicit FnProperties(const TargetMachine &TM, const Function &F) {
+ Features = TM.getSubtargetImpl(F)->getFeatureBits();
+
+ for (unsigned I = 0; I < NumAttr; ++I)
+ if (F.hasFnAttribute(AttributeNames[I]))
+ Attributes[I] = F.getFnAttribute(AttributeNames[I]);
+ }
+
+ bool operator == (const FnProperties &Other) const {
+ if ((Features & TargetFeatures) != (Other.Features & TargetFeatures))
+ return false;
+ for (unsigned I = 0; I < NumAttr; ++I)
+ if (Attributes[I] != Other.Attributes[I])
+ return false;
+ return true;
+ }
+
+ FnProperties adjustToCaller(const FnProperties &CallerProps) const {
+ FnProperties New((Features & ~TargetFeatures) | CallerProps.Features);
+ for (unsigned I = 0; I < NumAttr; ++I)
+ New.Attributes[I] = CallerProps.Attributes[I];
+ return New;
+ }
+
+ FeatureBitset Features;
+ Optional<Attribute> Attributes[NumAttr];
};
- class Clone{
+ class Clone {
public:
- Clone(FeatureBitset FeatureMask, Function *OrigF, Function *NewF) :
- FeatureMask(FeatureMask), OrigF(OrigF), NewF(NewF) {}
+ Clone(const FnProperties &Props, Function *OrigF, Function *NewF) :
+ Properties(Props), OrigF(OrigF), NewF(NewF) {}
- FeatureBitset FeatureMask;
+ FnProperties Properties;
Function *OrigF;
Function *NewF;
};
@@ -77,17 +120,19 @@ class AMDGPUPropagateAttributes {
SmallVector<Clone, 32> Clones;
// Find a clone with required features.
- Function *findFunction(const FeatureBitset &FeaturesNeeded,
+ Function *findFunction(const FnProperties &PropsNeeded,
Function *OrigF);
- // Clone function F and set NewFeatures on the clone.
+ // Clone function \p F and set \p NewProps on the clone.
+  // Clone takes the name of the original function.
- Function *cloneWithFeatures(Function &F,
- const FeatureBitset &NewFeatures);
+ Function *cloneWithProperties(Function &F, const FnProperties &NewProps);
// Set new function's features in place.
void setFeatures(Function &F, const FeatureBitset &NewFeatures);
+ // Set new function's attributes in place.
+ void setAttributes(Function &F, const ArrayRef<Optional<Attribute>> NewAttrs);
+
std::string getFeatureString(const FeatureBitset &Features) const;
// Propagate attributes from Roots.
@@ -155,11 +200,11 @@ INITIALIZE_PASS(AMDGPUPropagateAttributesLate,
false, false)
Function *
-AMDGPUPropagateAttributes::findFunction(const FeatureBitset &FeaturesNeeded,
+AMDGPUPropagateAttributes::findFunction(const FnProperties &PropsNeeded,
Function *OrigF) {
// TODO: search for clone's clones.
for (Clone &C : Clones)
- if (C.OrigF == OrigF && FeaturesNeeded == C.FeatureMask)
+ if (C.OrigF == OrigF && PropsNeeded == C.Properties)
return C.NewF;
return nullptr;
@@ -192,12 +237,12 @@ bool AMDGPUPropagateAttributes::process() {
NewRoots.clear();
for (auto &F : M.functions()) {
- if (F.isDeclaration() || Roots.count(&F) || Roots.count(&F))
+ if (F.isDeclaration())
continue;
- const FeatureBitset &CalleeBits =
- TM->getSubtargetImpl(F)->getFeatureBits();
+ const FnProperties CalleeProps(*TM, F);
SmallVector<std::pair<CallBase *, Function *>, 32> ToReplace;
+ SmallSet<CallBase *, 32> Visited;
for (User *U : F.users()) {
Instruction *I = dyn_cast<Instruction>(U);
@@ -207,36 +252,36 @@ bool AMDGPUPropagateAttributes::process() {
if (!CI)
continue;
Function *Caller = CI->getCaller();
- if (!Caller)
+ if (!Caller || !Visited.insert(CI).second)
continue;
- if (!Roots.count(Caller))
+ if (!Roots.count(Caller) && !NewRoots.count(Caller))
continue;
- const FeatureBitset &CallerBits =
- TM->getSubtargetImpl(*Caller)->getFeatureBits() & TargetFeatures;
+ const FnProperties CallerProps(*TM, *Caller);
- if (CallerBits == (CalleeBits & TargetFeatures)) {
- NewRoots.insert(&F);
+ if (CalleeProps == CallerProps) {
+ if (!Roots.count(&F))
+ NewRoots.insert(&F);
continue;
}
- Function *NewF = findFunction(CallerBits, &F);
+ Function *NewF = findFunction(CallerProps, &F);
if (!NewF) {
- FeatureBitset NewFeatures((CalleeBits & ~TargetFeatures) |
- CallerBits);
+ const FnProperties NewProps = CalleeProps.adjustToCaller(CallerProps);
if (!AllowClone) {
+          // This may set different features on different iterations if
// there is a contradiction in callers' attributes. In this case
// we rely on a second pass running on Module, which is allowed
// to clone.
- setFeatures(F, NewFeatures);
+ setFeatures(F, NewProps.Features);
+ setAttributes(F, NewProps.Attributes);
NewRoots.insert(&F);
Changed = true;
break;
}
- NewF = cloneWithFeatures(F, NewFeatures);
- Clones.push_back(Clone(CallerBits, &F, NewF));
+ NewF = cloneWithProperties(F, NewProps);
+ Clones.push_back(Clone(CallerProps, &F, NewF));
NewRoots.insert(NewF);
}
@@ -258,28 +303,30 @@ bool AMDGPUPropagateAttributes::process() {
F->eraseFromParent();
}
+ Roots.clear();
+ Clones.clear();
+
return Changed;
}
Function *
-AMDGPUPropagateAttributes::cloneWithFeatures(Function &F,
- const FeatureBitset &NewFeatures) {
+AMDGPUPropagateAttributes::cloneWithProperties(Function &F,
+ const FnProperties &NewProps) {
LLVM_DEBUG(dbgs() << "Cloning " << F.getName() << '\n');
ValueToValueMapTy dummy;
Function *NewF = CloneFunction(&F, dummy);
- setFeatures(*NewF, NewFeatures);
+ setFeatures(*NewF, NewProps.Features);
+ setAttributes(*NewF, NewProps.Attributes);
+ NewF->setVisibility(GlobalValue::DefaultVisibility);
+ NewF->setLinkage(GlobalValue::InternalLinkage);
// Swap names. If that is the only clone it will retain the name of now
- // dead value.
- if (F.hasName()) {
- std::string NewName = NewF->getName();
+ // dead value. Preserve original name for externally visible functions.
+ if (F.hasName() && F.hasLocalLinkage()) {
+ std::string NewName = std::string(NewF->getName());
NewF->takeName(&F);
F.setName(NewName);
-
- // Name has changed, it does not need an external symbol.
- F.setVisibility(GlobalValue::DefaultVisibility);
- F.setLinkage(GlobalValue::InternalLinkage);
}
return NewF;
@@ -297,6 +344,18 @@ void AMDGPUPropagateAttributes::setFeatures(Function &F,
F.addFnAttr("target-features", NewFeatureStr);
}
+void AMDGPUPropagateAttributes::setAttributes(Function &F,
+ const ArrayRef<Optional<Attribute>> NewAttrs) {
+ LLVM_DEBUG(dbgs() << "Set attributes on " << F.getName() << ":\n");
+ for (unsigned I = 0; I < NumAttr; ++I) {
+ F.removeFnAttr(AttributeNames[I]);
+ if (NewAttrs[I]) {
+ LLVM_DEBUG(dbgs() << '\t' << NewAttrs[I]->getAsString() << '\n');
+ F.addFnAttr(*NewAttrs[I]);
+ }
+ }
+}
+
std::string
AMDGPUPropagateAttributes::getFeatureString(const FeatureBitset &Features) const
{
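
The FnProperties helper introduced above merges caller and callee properties when attributes are propagated: the callee keeps every feature outside the propagated set, picks up the caller's feature bits, and takes the caller's tracked string attributes wholesale. A simplified, non-LLVM model of that merge (toy types, arbitrary bit width) follows.

#include <array>
#include <bitset>
#include <optional>
#include <string>

using ToyFeatures = std::bitset<8>;

struct ToyProps {
  ToyFeatures Features;
  std::array<std::optional<std::string>, 1> Attrs; // e.g. "amdgpu-waves-per-eu"
};

// Mirrors FnProperties::adjustToCaller: keep the callee's non-propagated
// features, OR in the caller's features, and copy the caller's attributes.
ToyProps adjustToCaller(const ToyProps &Callee, const ToyProps &Caller,
                        const ToyFeatures &PropagatedFeatures) {
  ToyProps New;
  New.Features = (Callee.Features & ~PropagatedFeatures) | Caller.Features;
  New.Attrs = Caller.Attrs;
  return New;
}
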
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
new file mode 100644
index 0000000000000..71d82679b3ff1
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -0,0 +1,154 @@
+//=== lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass does combining of machine instructions at the generic MI level,
+// after register banks are known.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUTargetMachine.h"
+#include "AMDGPULegalizerInfo.h"
+#include "llvm/CodeGen/GlobalISel/Combiner.h"
+#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
+#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
+#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/Support/Debug.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+
+#define DEBUG_TYPE "amdgpu-regbank-combiner"
+
+using namespace llvm;
+using namespace MIPatternMatch;
+
+#define AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_DEPS
+#include "AMDGPUGenRegBankGICombiner.inc"
+#undef AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_DEPS
+
+namespace {
+#define AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_H
+#include "AMDGPUGenRegBankGICombiner.inc"
+#undef AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_H
+
+class AMDGPURegBankCombinerInfo : public CombinerInfo {
+ GISelKnownBits *KB;
+ MachineDominatorTree *MDT;
+
+public:
+ AMDGPUGenRegBankCombinerHelperRuleConfig GeneratedRuleCfg;
+
+ AMDGPURegBankCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
+ const AMDGPULegalizerInfo *LI,
+ GISelKnownBits *KB, MachineDominatorTree *MDT)
+ : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
+ /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
+ KB(KB), MDT(MDT) {
+ if (!GeneratedRuleCfg.parseCommandLineOption())
+ report_fatal_error("Invalid rule identifier");
+ }
+
+ bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
+ MachineIRBuilder &B) const override;
+};
+
+bool AMDGPURegBankCombinerInfo::combine(GISelChangeObserver &Observer,
+ MachineInstr &MI,
+ MachineIRBuilder &B) const {
+ CombinerHelper Helper(Observer, B, KB, MDT);
+ AMDGPUGenRegBankCombinerHelper Generated(GeneratedRuleCfg);
+
+ if (Generated.tryCombineAll(Observer, MI, B, Helper))
+ return true;
+
+ return false;
+}
+
+#define AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_CPP
+#include "AMDGPUGenRegBankGICombiner.inc"
+#undef AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_CPP
+
+// Pass boilerplate
+// ================
+
+class AMDGPURegBankCombiner : public MachineFunctionPass {
+public:
+ static char ID;
+
+ AMDGPURegBankCombiner(bool IsOptNone = false);
+
+ StringRef getPassName() const override {
+ return "AMDGPURegBankCombiner";
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+private:
+ bool IsOptNone;
+};
+} // end anonymous namespace
+
+void AMDGPURegBankCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetPassConfig>();
+ AU.setPreservesCFG();
+ getSelectionDAGFallbackAnalysisUsage(AU);
+ AU.addRequired<GISelKnownBitsAnalysis>();
+ AU.addPreserved<GISelKnownBitsAnalysis>();
+ if (!IsOptNone) {
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ }
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+AMDGPURegBankCombiner::AMDGPURegBankCombiner(bool IsOptNone)
+ : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
+ initializeAMDGPURegBankCombinerPass(*PassRegistry::getPassRegistry());
+}
+
+bool AMDGPURegBankCombiner::runOnMachineFunction(MachineFunction &MF) {
+ if (MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::FailedISel))
+ return false;
+ auto *TPC = &getAnalysis<TargetPassConfig>();
+ const Function &F = MF.getFunction();
+ bool EnableOpt =
+ MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);
+
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const AMDGPULegalizerInfo *LI
+ = static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());
+
+ GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
+ MachineDominatorTree *MDT =
+ IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
+ AMDGPURegBankCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
+ F.hasMinSize(), LI, KB, MDT);
+ Combiner C(PCInfo, TPC);
+ return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
+}
+
+char AMDGPURegBankCombiner::ID = 0;
+INITIALIZE_PASS_BEGIN(AMDGPURegBankCombiner, DEBUG_TYPE,
+ "Combine AMDGPU machine instrs after regbankselect",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
+INITIALIZE_PASS_END(AMDGPURegBankCombiner, DEBUG_TYPE,
+ "Combine AMDGPU machine instrs after regbankselect", false,
+ false)
+
+namespace llvm {
+FunctionPass *createAMDGPURegBankCombiner(bool IsOptNone) {
+ return new AMDGPURegBankCombiner(IsOptNone);
+}
+} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 1bb01dc8fa112..dfaf97bfb08e7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -8,10 +8,69 @@
/// \file
/// This file implements the targeting of the RegisterBankInfo class for
/// AMDGPU.
-/// \todo This should be generated by TableGen.
+///
+/// \par
+///
+/// AMDGPU has unique register bank constraints that require special high level
+/// strategies to deal with. There are two main true physical register banks,
+/// VGPR (vector) and SGPR (scalar). Additionally, the VCC register bank is a
+/// sort of pseudo-register bank needed to represent SGPRs used in a vector
+/// boolean context. There is also the AGPR bank, which is a special purpose
+/// physical register bank present on some subtargets.
+///
+/// Copying from VGPR to SGPR is generally illegal, unless the value is known to
+/// be uniform. It is generally not valid to legalize operands by inserting
+/// copies as on other targets. Operations which require uniform, SGPR operands
+/// generally require scalarization by repeatedly executing the instruction,
+/// activating each set of lanes using a unique set of input values. This is
+/// referred to as a waterfall loop.
+///
+/// \par Booleans
+///
+/// Booleans (s1 values) require special consideration. A vector compare result
+/// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
+/// register. These are represented with the VCC bank. During selection, we need
+/// to be able to unambiguously go back from a register class to a register
+/// bank. To distinguish whether an SGPR should use the SGPR or VCC register
+/// bank, we need to know the use context type. An SGPR s1 value always means a
+/// VCC bank value, otherwise it will be the SGPR bank. A scalar compare sets
+/// SCC, which is a 1-bit unaddressable register. This will need to be copied to
+/// a 32-bit virtual register. Taken together, this means we need to adjust the
+/// type of boolean operations to be regbank legal. All SALU booleans need to be
+/// widened to 32-bits, and all VALU booleans need to be s1 values.
+///
+/// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
+/// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
+/// bank. A non-boolean source (such as a truncate from a 1-bit load from
+/// memory) will require a copy to the VCC bank which will require clearing the
+/// high bits and inserting a compare.
+///
+/// \par Constant bus restriction
+///
+/// VALU instructions have a limitation known as the constant bus
+/// restriction. Most VALU instructions can use SGPR operands, but may read at
+/// most 1 SGPR or constant literal value (this is relaxed to 2 in gfx10 for
+/// most instructions). This is one unique SGPR, so the same SGPR may be used
+/// for multiple operands. From a register bank perspective, any combination of
+/// operands should be legal as an SGPR, but this is contextually dependent on
+/// the SGPR operands all being the same register. It is therefore optimal to
+/// choose the SGPR with the most uses to minimize the number of copies.
+///
+/// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
+/// operation should have its source operands all mapped to VGPRs (except for
+/// VCC), inserting copies from any SGPR operands. This is the most trivial
+/// legal mapping. Anything beyond the simplest 1:1 instruction selection would
+/// be too complicated to solve here. Every optimization pattern or instruction
+/// selected to multiple outputs would have to enforce this rule, and there
+/// would be additional complexity in tracking this rule for every G_*
+/// operation. Forcing all inputs to VGPRs also simplifies the task of
+/// picking the optimal operand combination from a post-isel optimization pass.
+///
//===----------------------------------------------------------------------===//
#include "AMDGPURegisterBankInfo.h"
+
+#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
@@ -19,8 +78,8 @@
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
-#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
@@ -101,8 +160,9 @@ public:
if (!Op.isReg())
continue;
+ // We may see physical registers if building a real MI
Register Reg = Op.getReg();
- if (MRI.getRegClassOrRegBank(Reg))
+ if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
continue;
const RegisterBank *RB = NewBank;
@@ -138,15 +198,16 @@ AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
TII(Subtarget.getInstrInfo()) {
// HACK: Until this is fully tablegen'd.
- static bool AlreadyInit = false;
- if (AlreadyInit)
- return;
+ static llvm::once_flag InitializeRegisterBankFlag;
- AlreadyInit = true;
+ static auto InitializeRegisterBankOnce = [this]() {
+ assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
+ &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
+ &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
+ (void)this;
+ };
- assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
- &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
- &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
+ llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
}
static bool isVectorRegisterBank(const RegisterBank &Bank) {
@@ -159,7 +220,7 @@ unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
unsigned Size) const {
// TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
if (Dst.getID() == AMDGPU::SGPRRegBankID &&
- isVectorRegisterBank(Src)) {
+ (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
return std::numeric_limits<unsigned>::max();
}
@@ -177,9 +238,6 @@ unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
Src.getID() == AMDGPU::VCCRegBankID))
return std::numeric_limits<unsigned>::max();
- if (Src.getID() == AMDGPU::VCCRegBankID)
- return std::numeric_limits<unsigned>::max();
-
// There is no direct copy between AGPRs.
if (Dst.getID() == AMDGPU::AGPRRegBankID &&
Src.getID() == AMDGPU::AGPRRegBankID)
@@ -317,22 +375,6 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
switch (MI.getIntrinsicID()) {
- case Intrinsic::amdgcn_buffer_load: {
- static const OpRegBankEntry<3> Table[4] = {
- // Perfectly legal.
- { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
- { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
-
- // Waterfall loop needed for rsrc. In the worst case this will execute
- // approximately an extra 10 * wavesize + 2 instructions.
- { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
- { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1000 }
- };
-
- // rsrc, voffset, offset
- const std::array<unsigned, 3> RegSrcOpIdx = { { 2, 3, 4 } };
- return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
- }
case Intrinsic::amdgcn_s_buffer_load: {
static const OpRegBankEntry<2> Table[4] = {
// Perfectly legal.
@@ -402,15 +444,15 @@ static bool isScalarLoadLegal(const MachineInstr &MI) {
AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
// There are no extending SMRD/SMEM loads, and they require 4-byte alignment.
- return MMO->getSize() >= 4 && MMO->getAlignment() >= 4 &&
- // Can't do a scalar atomic load.
- !MMO->isAtomic() &&
- // Don't use scalar loads for volatile accesses to non-constant address
- // spaces.
- (IsConst || !MMO->isVolatile()) &&
- // Memory must be known constant, or not written before this load.
- (IsConst || MMO->isInvariant() || memOpHasNoClobbered(MMO)) &&
- AMDGPUInstrInfo::isUniformMMO(MMO);
+ return MMO->getSize() >= 4 && MMO->getAlign() >= Align(4) &&
+ // Can't do a scalar atomic load.
+ !MMO->isAtomic() &&
+ // Don't use scalar loads for volatile accesses to non-constant address
+ // spaces.
+ (IsConst || !MMO->isVolatile()) &&
+ // Memory must be known constant, or not written before this load.
+ (IsConst || MMO->isInvariant() || memOpHasNoClobbered(MMO)) &&
+ AMDGPUInstrInfo::isUniformMMO(MMO);
}
RegisterBankInfo::InstructionMappings
@@ -490,24 +532,6 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings(
AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
3); // Num Operands
AltMappings.push_back(&VVMapping);
-
- const InstructionMapping &SVMapping = getInstructionMapping(
- 3, 3, getOperandsMapping(
- {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
- AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size),
- AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
- 3); // Num Operands
- AltMappings.push_back(&SVMapping);
-
- // SGPR in LHS is slightly preferrable, so make it VS more expensive than
- // SV.
- const InstructionMapping &VSMapping = getInstructionMapping(
- 3, 4, getOperandsMapping(
- {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
- AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
- AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size)}),
- 3); // Num Operands
- AltMappings.push_back(&VSMapping);
break;
}
case TargetOpcode::G_LOAD:
@@ -517,7 +541,6 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings(
LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
unsigned PtrSize = PtrTy.getSizeInBits();
unsigned AS = PtrTy.getAddressSpace();
- LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
AS != AMDGPUAS::PRIVATE_ADDRESS) &&
@@ -531,9 +554,10 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings(
}
const InstructionMapping &VVMapping = getInstructionMapping(
- 2, 1, getOperandsMapping(
- {AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy),
- AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
+ 2, 1,
+ getOperandsMapping(
+ {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
2); // Num Operands
AltMappings.push_back(&VVMapping);
@@ -546,43 +570,6 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings(
return AltMappings;
}
- case TargetOpcode::G_ICMP: {
- // TODO: Should report 32-bit for scalar output type.
- unsigned Size = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
- const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
- getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
- nullptr, // Predicate operand.
- AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
- AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
- 4); // Num Operands
- AltMappings.push_back(&SSMapping);
-
- const InstructionMapping &SVMapping = getInstructionMapping(2, 1,
- getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
- nullptr, // Predicate operand.
- AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
- AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
- 4); // Num Operands
- AltMappings.push_back(&SVMapping);
-
- const InstructionMapping &VSMapping = getInstructionMapping(3, 1,
- getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
- nullptr, // Predicate operand.
- AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
- AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
- 4); // Num Operands
- AltMappings.push_back(&VSMapping);
-
- const InstructionMapping &VVMapping = getInstructionMapping(4, 1,
- getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
- nullptr, // Predicate operand.
- AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
- AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
- 4); // Num Operands
- AltMappings.push_back(&VVMapping);
-
- return AltMappings;
- }
case TargetOpcode::G_SELECT: {
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
@@ -607,10 +594,8 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings(
case TargetOpcode::G_SMAX:
case TargetOpcode::G_UMIN:
case TargetOpcode::G_UMAX: {
- static const OpRegBankEntry<3> Table[4] = {
+ static const OpRegBankEntry<3> Table[2] = {
{ { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
- { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
- { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
// Scalar requires cmp+select, and extends if 16-bit.
// FIXME: Should there be separate costs for 32 and 16-bit
@@ -740,6 +725,10 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
SmallVector<Register, 4> InitResultRegs;
SmallVector<Register, 4> PhiRegs;
+ // Track use registers which have already been expanded with a readfirstlane
+  // sequence. A register may have multiple uses when moving a whole sequence.
+ DenseMap<Register, Register> WaterfalledRegMap;
+
MachineBasicBlock &MBB = B.getMBB();
MachineFunction *MF = &B.getMF();
@@ -755,6 +744,10 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
const unsigned ExecReg = Subtarget.isWave32() ?
AMDGPU::EXEC_LO : AMDGPU::EXEC;
+#ifndef NDEBUG
+ const int OrigRangeSize = std::distance(Range.begin(), Range.end());
+#endif
+
for (MachineInstr &MI : Range) {
for (MachineOperand &Def : MI.defs()) {
LLT ResTy = MRI.getType(Def.getReg());
@@ -820,13 +813,14 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
const DebugLoc &DL = B.getDL();
- // Figure out the iterator range after splicing the instructions.
- auto NewBegin = std::prev(LoopBB->end());
+ MachineInstr &FirstInst = *Range.begin();
// Move the instruction into the loop. Note we moved everything after
// Range.end() already into a new block, so Range.end() is no longer valid.
LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end());
+ // Figure out the iterator range after splicing the instructions.
+ MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
auto NewEnd = LoopBB->end();
MachineBasicBlock::iterator I = Range.begin();
@@ -834,39 +828,145 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
Register CondReg;
+ assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
+
for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
for (MachineOperand &Op : MI.uses()) {
if (!Op.isReg() || Op.isDef())
continue;
- if (SGPROperandRegs.count(Op.getReg())) {
- LLT OpTy = MRI.getType(Op.getReg());
- unsigned OpSize = OpTy.getSizeInBits();
+ Register OldReg = Op.getReg();
+ if (!SGPROperandRegs.count(OldReg))
+ continue;
+
+ // See if we already processed this register in another instruction in the
+ // sequence.
+ auto OldVal = WaterfalledRegMap.find(OldReg);
+ if (OldVal != WaterfalledRegMap.end()) {
+ Op.setReg(OldVal->second);
+ continue;
+ }
+
+ LLT OpTy = MRI.getType(Op.getReg());
+ unsigned OpSize = OpTy.getSizeInBits();
+
+ // Can only do a readlane of 32-bit pieces.
+ if (OpSize == 32) {
+ // Avoid extra copies in the simple case of one 32-bit register.
+ Register CurrentLaneOpReg
+ = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ MRI.setType(CurrentLaneOpReg, OpTy);
+
+ constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI);
+ // Read the next variant <- also loop target.
+ BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+ CurrentLaneOpReg)
+ .addReg(Op.getReg());
+
+ Register NewCondReg = MRI.createVirtualRegister(WaveRC);
+ bool First = CondReg == AMDGPU::NoRegister;
+ if (First)
+ CondReg = NewCondReg;
+
+        // Compare the just-read uniform value against the value in each lane.
+ B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
+ .addDef(NewCondReg)
+ .addReg(CurrentLaneOpReg)
+ .addReg(Op.getReg());
+ Op.setReg(CurrentLaneOpReg);
+
+ if (!First) {
+ Register AndReg = MRI.createVirtualRegister(WaveRC);
+
+          // If there are multiple operands to consider, AND the conditions together.
+ B.buildInstr(WaveAndOpc)
+ .addDef(AndReg)
+ .addReg(NewCondReg)
+ .addReg(CondReg);
+ CondReg = AndReg;
+ }
+ } else {
+ LLT S32 = LLT::scalar(32);
+ SmallVector<Register, 8> ReadlanePieces;
+
+ // The compares can be done as 64-bit, but the extract needs to be done
+ // in 32-bit pieces.
+
+ bool Is64 = OpSize % 64 == 0;
+
+ LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
+ unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
+ : AMDGPU::V_CMP_EQ_U32_e64;
+
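+        // e.g. a 256-bit operand is unmerged into four 64-bit pieces, while a
+        // 96-bit operand (not a multiple of 64) is unmerged into three 32-bit
+        // pieces.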
+ // Insert the unmerge before the loop.
+
+ B.setMBB(MBB);
+ auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg());
+ B.setInstr(*I);
+
+ unsigned NumPieces = Unmerge->getNumOperands() - 1;
+ for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
+ Register UnmergePiece = Unmerge.getReg(PieceIdx);
+
+ Register CurrentLaneOpReg;
+ if (Is64) {
+ Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
+ Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);
+
+ MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
+ MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
+ MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);
+
+ // Read the next variant <- also loop target.
+ BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+ CurrentLaneOpRegLo)
+ .addReg(UnmergePiece, 0, AMDGPU::sub0);
+
+ // Read the next variant <- also loop target.
+ BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+ CurrentLaneOpRegHi)
+ .addReg(UnmergePiece, 0, AMDGPU::sub1);
+
+ CurrentLaneOpReg =
+ B.buildMerge(LLT::scalar(64),
+ {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
+ .getReg(0);
- // Can only do a readlane of 32-bit pieces.
- if (OpSize == 32) {
- // Avoid extra copies in the simple case of one 32-bit register.
- Register CurrentLaneOpReg
- = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
- MRI.setType(CurrentLaneOpReg, OpTy);
+ MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);
- constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI);
- // Read the next variant <- also loop target.
- BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
- CurrentLaneOpReg)
- .addReg(Op.getReg());
+ if (OpTy.getScalarSizeInBits() == 64) {
+              // If we need to produce a 64-bit element vector, use the
+              // merged pieces.
+ ReadlanePieces.push_back(CurrentLaneOpReg);
+ } else {
+ // 32-bit element type.
+ ReadlanePieces.push_back(CurrentLaneOpRegLo);
+ ReadlanePieces.push_back(CurrentLaneOpRegHi);
+ }
+ } else {
+ CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32);
+ MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
+ MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);
+
+ // Read the next variant <- also loop target.
+ BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+ CurrentLaneOpReg)
+ .addReg(UnmergePiece);
+ ReadlanePieces.push_back(CurrentLaneOpReg);
+ }
Register NewCondReg = MRI.createVirtualRegister(WaveRC);
bool First = CondReg == AMDGPU::NoRegister;
if (First)
CondReg = NewCondReg;
- // Compare the just read M0 value to all possible Idx values.
- B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
+ B.buildInstr(CmpOp)
.addDef(NewCondReg)
.addReg(CurrentLaneOpReg)
- .addReg(Op.getReg());
- Op.setReg(CurrentLaneOpReg);
+ .addReg(UnmergePiece);
if (!First) {
Register AndReg = MRI.createVirtualRegister(WaveRC);
@@ -878,114 +978,23 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
.addReg(CondReg);
CondReg = AndReg;
}
- } else {
- LLT S32 = LLT::scalar(32);
- SmallVector<Register, 8> ReadlanePieces;
-
- // The compares can be done as 64-bit, but the extract needs to be done
- // in 32-bit pieces.
-
- bool Is64 = OpSize % 64 == 0;
-
- LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
- unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
- : AMDGPU::V_CMP_EQ_U32_e64;
-
- // The compares can be done as 64-bit, but the extract needs to be done
- // in 32-bit pieces.
-
- // Insert the unmerge before the loop.
-
- B.setMBB(MBB);
- auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg());
- B.setInstr(*I);
-
- unsigned NumPieces = Unmerge->getNumOperands() - 1;
- for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
- Register UnmergePiece = Unmerge.getReg(PieceIdx);
-
- Register CurrentLaneOpReg;
- if (Is64) {
- Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
- Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);
-
- MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
- MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
- MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);
-
- // Read the next variant <- also loop target.
- BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
- CurrentLaneOpRegLo)
- .addReg(UnmergePiece, 0, AMDGPU::sub0);
-
- // Read the next variant <- also loop target.
- BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
- CurrentLaneOpRegHi)
- .addReg(UnmergePiece, 0, AMDGPU::sub1);
-
- CurrentLaneOpReg =
- B.buildMerge(LLT::scalar(64),
- {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
- .getReg(0);
-
- MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);
-
- if (OpTy.getScalarSizeInBits() == 64) {
- // If we need to produce a 64-bit element vector, so use the
- // merged pieces
- ReadlanePieces.push_back(CurrentLaneOpReg);
- } else {
- // 32-bit element type.
- ReadlanePieces.push_back(CurrentLaneOpRegLo);
- ReadlanePieces.push_back(CurrentLaneOpRegHi);
- }
- } else {
- CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32);
- MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
- MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);
-
- // Read the next variant <- also loop target.
- BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
- CurrentLaneOpReg)
- .addReg(UnmergePiece);
- ReadlanePieces.push_back(CurrentLaneOpReg);
- }
-
- Register NewCondReg = MRI.createVirtualRegister(WaveRC);
- bool First = CondReg == AMDGPU::NoRegister;
- if (First)
- CondReg = NewCondReg;
-
- B.buildInstr(CmpOp)
- .addDef(NewCondReg)
- .addReg(CurrentLaneOpReg)
- .addReg(UnmergePiece);
-
- if (!First) {
- Register AndReg = MRI.createVirtualRegister(WaveRC);
-
- // If there are multiple operands to consider, and the conditions.
- B.buildInstr(WaveAndOpc)
- .addDef(AndReg)
- .addReg(NewCondReg)
- .addReg(CondReg);
- CondReg = AndReg;
- }
- }
-
- // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
- // BUILD_VECTOR
- if (OpTy.isVector()) {
- auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
- Op.setReg(Merge.getReg(0));
- } else {
- auto Merge = B.buildMerge(OpTy, ReadlanePieces);
- Op.setReg(Merge.getReg(0));
- }
+ }
- MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank);
+ // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
+ // BUILD_VECTOR
+ if (OpTy.isVector()) {
+ auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
+ Op.setReg(Merge.getReg(0));
+ } else {
+ auto Merge = B.buildMerge(OpTy, ReadlanePieces);
+ Op.setReg(Merge.getReg(0));
}
+
+ MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank);
}
+
+ // Make sure we don't re-process this register again.
+ WaterfalledRegMap.insert(std::make_pair(OldReg, Op.getReg()));
}
}
@@ -1093,53 +1102,89 @@ void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
MI.getOperand(OpIdx).setReg(SGPR);
}
-// When regbankselect repairs registers, it will insert a repair instruction
-// which defines the repaired register. Then it calls applyMapping and expects
-// that the targets will either delete or rewrite the originally wrote to the
-// repaired registers. Beccause of this, we end up in a situation where
-// we have 2 instructions defining the same registers.
-static MachineInstr *getOtherVRegDef(const MachineRegisterInfo &MRI,
- Register Reg,
- const MachineInstr &MI) {
- // Is there some way we can assert that there are exactly 2 def instructions?
- for (MachineInstr &Other : MRI.def_instructions(Reg)) {
- if (&Other != &MI)
- return &Other;
- }
-
- return nullptr;
+/// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
+/// rest will be in the remainder.
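+/// e.g. splitting <3 x s32> at 64 bits yields (<2 x s32>, s32), and splitting
+/// s96 yields (s64, s32).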
+static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
+ unsigned TotalSize = Ty.getSizeInBits();
+ if (!Ty.isVector())
+ return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)};
+
+ LLT EltTy = Ty.getElementType();
+ unsigned EltSize = EltTy.getSizeInBits();
+ assert(FirstSize % EltSize == 0);
+
+ unsigned FirstPartNumElts = FirstSize / EltSize;
+ unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;
+
+ return {LLT::scalarOrVector(FirstPartNumElts, EltTy),
+ LLT::scalarOrVector(RemainderElts, EltTy)};
+}
+
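+// e.g. s96 widens to s128 and <3 x s32> widens to <4 x s32>.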
+static LLT widen96To128(LLT Ty) {
+ if (!Ty.isVector())
+ return LLT::scalar(128);
+
+ LLT EltTy = Ty.getElementType();
+ assert(128 % EltTy.getSizeInBits() == 0);
+ return LLT::vector(128 / EltTy.getSizeInBits(), EltTy);
}
-bool AMDGPURegisterBankInfo::applyMappingWideLoad(MachineInstr &MI,
+bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI,
const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
MachineRegisterInfo &MRI) const {
Register DstReg = MI.getOperand(0).getReg();
- const LLT LoadTy = MRI.getType(DstReg);
+ const LLT LoadTy = MRI.getType(DstReg);
unsigned LoadSize = LoadTy.getSizeInBits();
const unsigned MaxNonSmrdLoadSize = 128;
+
+ const RegisterBank *PtrBank =
+ OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
+ if (PtrBank == &AMDGPU::SGPRRegBank) {
+ // If the pointer is an SGPR, we ordinarily have nothing to do.
+ if (LoadSize != 96)
+ return false;
+
+ MachineMemOperand *MMO = *MI.memoperands_begin();
+ Register PtrReg = MI.getOperand(1).getReg();
+    // 96-bit loads are only available for vector loads. We need to split this
+    // into a 64-bit part and a 32-bit part (unless we can widen to a 128-bit
+    // load).
+
+ MachineIRBuilder B(MI);
+ ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank);
+ GISelObserverWrapper Observer(&O);
+ B.setChangeObserver(Observer);
+
+ if (MMO->getAlign() < Align(16)) {
+ LLT Part64, Part32;
+ std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
+ auto Load0 = B.buildLoadFromOffset(Part64, PtrReg, *MMO, 0);
+ auto Load1 = B.buildLoadFromOffset(Part32, PtrReg, *MMO, 8);
+
+ auto Undef = B.buildUndef(LoadTy);
+ auto Ins0 = B.buildInsert(LoadTy, Undef, Load0, 0);
+ B.buildInsert(MI.getOperand(0), Ins0, Load1, 64);
+ } else {
+ LLT WiderTy = widen96To128(LoadTy);
+ auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
+ B.buildExtract(MI.getOperand(0), WideLoad, 0);
+ }
+
+ MI.eraseFromParent();
+ return true;
+ }
+
// 128-bit loads are supported for all instruction types.
if (LoadSize <= MaxNonSmrdLoadSize)
return false;
- SmallVector<unsigned, 16> DefRegs(OpdMapper.getVRegs(0));
- SmallVector<unsigned, 1> SrcRegs(OpdMapper.getVRegs(1));
+ SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0));
+ SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));
- // If the pointer is an SGPR, we have nothing to do.
- if (SrcRegs.empty()) {
- const RegisterBank *PtrBank =
- OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
- if (PtrBank == &AMDGPU::SGPRRegBank)
- return false;
+ if (SrcRegs.empty())
SrcRegs.push_back(MI.getOperand(1).getReg());
- }
assert(LoadSize % MaxNonSmrdLoadSize == 0);
- // We want to get the repair instruction now, because it will help us
- // determine which instruction the legalizer inserts that will also
- // write to DstReg.
- MachineInstr *RepairInst = getOtherVRegDef(MRI, DstReg, MI);
-
// RegBankSelect only emits scalar types, so we need to reset the pointer
// operand to a pointer type.
Register BasePtrReg = SrcRegs[0];
@@ -1148,38 +1193,72 @@ bool AMDGPURegisterBankInfo::applyMappingWideLoad(MachineInstr &MI,
MachineIRBuilder B(MI);
- unsigned SplitElts =
- MaxNonSmrdLoadSize / LoadTy.getScalarType().getSizeInBits();
- const LLT LoadSplitTy = LLT::vector(SplitElts, LoadTy.getScalarType());
+ unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
+ const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
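+  // e.g. a 256-bit VGPR load is rewritten as two 128-bit loads.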
ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank);
GISelObserverWrapper Observer(&O);
B.setChangeObserver(Observer);
LegalizerHelper Helper(B.getMF(), Observer, B);
- if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
+
+ if (LoadTy.isVector()) {
+ if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
+ return false;
+ } else {
+ if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
+ return false;
+ }
+
+ MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
+ return true;
+}
+
+bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
+ MachineInstr &MI,
+ const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
+ MachineRegisterInfo &MRI) const {
+ const MachineFunction &MF = *MI.getMF();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const auto &TFI = *ST.getFrameLowering();
+
+ // Guard in case the stack growth direction ever changes with scratch
+ // instructions.
+ if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown)
return false;
- // At this point, the legalizer has split the original load into smaller
- // loads. At the end of lowering, it inserts an instruction (LegalizedInst)
- // that combines the outputs of the lower loads and writes it to DstReg.
- // The register bank selector has also added the RepairInst which writes to
- // DstReg as well.
+ Register Dst = MI.getOperand(0).getReg();
+ Register AllocSize = MI.getOperand(1).getReg();
+ Align Alignment = assumeAligned(MI.getOperand(2).getImm());
- MachineInstr *LegalizedInst = getOtherVRegDef(MRI, DstReg, *RepairInst);
+ const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);
- // Replace the output of the LegalizedInst with a temporary register, since
- // RepairInst already defines DstReg.
- Register TmpReg = MRI.createGenericVirtualRegister(MRI.getType(DstReg));
- LegalizedInst->getOperand(0).setReg(TmpReg);
- B.setInsertPt(*RepairInst->getParent(), RepairInst);
+ // TODO: Need to emit a wave reduction to get the maximum size.
+ if (SizeBank != &AMDGPU::SGPRRegBank)
+ return false;
- for (unsigned DefIdx = 0, e = DefRegs.size(); DefIdx != e; ++DefIdx) {
- Register IdxReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
- B.buildConstant(IdxReg, DefIdx);
- MRI.setRegBank(IdxReg, AMDGPU::VGPRRegBank);
- B.buildExtractVectorElement(DefRegs[DefIdx], TmpReg, IdxReg);
+ LLT PtrTy = MRI.getType(Dst);
+ LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
+
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ Register SPReg = Info->getStackPtrOffsetReg();
+ ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
+ GISelObserverWrapper Observer(&ApplyBank);
+
+ MachineIRBuilder B(MI);
+ B.setChangeObserver(Observer);
+
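+  // The stack pointer tracks the allocation for the entire wave, so the
+  // per-lane allocation size is scaled up by the wavefront size.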
+ auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
+ auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);
+
+ auto SPCopy = B.buildCopy(PtrTy, SPReg);
+ if (Alignment > TFI.getStackAlign()) {
+ auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize);
+ B.buildMaskLowPtrBits(Dst, PtrAdd,
+ Log2(Alignment) + ST.getWavefrontSizeLog2());
+ } else {
+ B.buildPtrAdd(Dst, SPCopy, ScaledSize);
}
- MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
+ MI.eraseFromParent();
return true;
}
@@ -1210,6 +1289,281 @@ bool AMDGPURegisterBankInfo::applyMappingImage(
return true;
}
+static Register getSrcRegIgnoringCopies(const MachineRegisterInfo &MRI,
+ Register Reg) {
+ MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
+ if (!Def)
+ return Reg;
+
+ // TODO: Guard against this being an implicit def
+ return Def->getOperand(0).getReg();
+}
+
+// Analyze a combined offset from an llvm.amdgcn.s.buffer.load intrinsic and
+// store the three offsets (voffset, soffset and instoffset).
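+// Returns the constant offset the caller should fold into the memory operand,
+// or 0 when no such constant is known.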
+static unsigned setBufferOffsets(MachineIRBuilder &B,
+ const AMDGPURegisterBankInfo &RBI,
+ Register CombinedOffset, Register &VOffsetReg,
+ Register &SOffsetReg, int64_t &InstOffsetVal,
+ Align Alignment) {
+ const LLT S32 = LLT::scalar(32);
+ MachineRegisterInfo *MRI = B.getMRI();
+
+ if (Optional<int64_t> Imm = getConstantVRegVal(CombinedOffset, *MRI)) {
+ uint32_t SOffset, ImmOffset;
+ if (AMDGPU::splitMUBUFOffset(*Imm, SOffset, ImmOffset, &RBI.Subtarget,
+ Alignment)) {
+ VOffsetReg = B.buildConstant(S32, 0).getReg(0);
+ SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
+ InstOffsetVal = ImmOffset;
+
+ B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
+ B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
+ return SOffset + ImmOffset;
+ }
+ }
+
+ Register Base;
+ unsigned Offset;
+ MachineInstr *Unused;
+
+ std::tie(Base, Offset, Unused)
+ = AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);
+
+ uint32_t SOffset, ImmOffset;
+ if (Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
+ &RBI.Subtarget, Alignment)) {
+ if (RBI.getRegBank(Base, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
+ VOffsetReg = Base;
+ SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
+ B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
+ InstOffsetVal = ImmOffset;
+ return 0; // XXX - Why is this 0?
+ }
+
+ // If we have SGPR base, we can use it for soffset.
+ if (SOffset == 0) {
+ VOffsetReg = B.buildConstant(S32, 0).getReg(0);
+ B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
+ SOffsetReg = Base;
+ InstOffsetVal = ImmOffset;
+ return 0; // XXX - Why is this 0?
+ }
+ }
+
+ // Handle the variable sgpr + vgpr case.
+ if (MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI)) {
+ Register Src0 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(1).getReg());
+ Register Src1 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(2).getReg());
+
+ const RegisterBank *Src0Bank = RBI.getRegBank(Src0, *MRI, *RBI.TRI);
+ const RegisterBank *Src1Bank = RBI.getRegBank(Src1, *MRI, *RBI.TRI);
+
+ if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
+ VOffsetReg = Src0;
+ SOffsetReg = Src1;
+ return 0;
+ }
+
+ if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
+ VOffsetReg = Src1;
+ SOffsetReg = Src0;
+ return 0;
+ }
+ }
+
+ // Ensure we have a VGPR for the combined offset. This could be an issue if we
+ // have an SGPR offset and a VGPR resource.
+ if (RBI.getRegBank(CombinedOffset, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
+ VOffsetReg = CombinedOffset;
+ } else {
+ VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
+ B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
+ }
+
+ SOffsetReg = B.buildConstant(S32, 0).getReg(0);
+ B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
+ return 0;
+}
+
+bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
+ const OperandsMapper &OpdMapper) const {
+ MachineInstr &MI = OpdMapper.getMI();
+ MachineRegisterInfo &MRI = OpdMapper.getMRI();
+
+ const LLT S32 = LLT::scalar(32);
+ Register Dst = MI.getOperand(0).getReg();
+ LLT Ty = MRI.getType(Dst);
+
+ const RegisterBank *RSrcBank =
+ OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
+ const RegisterBank *OffsetBank =
+ OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
+ if (RSrcBank == &AMDGPU::SGPRRegBank &&
+ OffsetBank == &AMDGPU::SGPRRegBank)
+ return true; // Legal mapping
+
+  // FIXME: The 96-bit case was widened during legalization. We need to narrow
+  // it back here, but don't have an MMO.
+
+ unsigned LoadSize = Ty.getSizeInBits();
+ int NumLoads = 1;
+ if (LoadSize == 256 || LoadSize == 512) {
+ NumLoads = LoadSize / 128;
+ Ty = Ty.divide(NumLoads);
+ }
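+  // e.g. a 512-bit result is loaded as four 128-bit pieces at 16-byte
+  // increments.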
+
+ // Use the alignment to ensure that the required offsets will fit into the
+ // immediate offsets.
+ const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);
+
+ MachineIRBuilder B(MI);
+ MachineFunction &MF = B.getMF();
+
+ Register SOffset;
+ Register VOffset;
+ int64_t ImmOffset = 0;
+
+ unsigned MMOOffset = setBufferOffsets(B, *this, MI.getOperand(2).getReg(),
+ VOffset, SOffset, ImmOffset, Alignment);
+
+  // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if
+  // we can, but we need to track an MMO for that.
+ const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
+ const Align MemAlign(4); // FIXME: ABI type alignment?
+ MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
+ MachinePointerInfo(),
+ MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant,
+ MemSize, MemAlign);
+ if (MMOOffset != 0)
+ BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);
+
+ // If only the offset is divergent, emit a MUBUF buffer load instead. We can
+ // assume that the buffer is unswizzled.
+
+ Register RSrc = MI.getOperand(1).getReg();
+ Register VIndex = B.buildConstant(S32, 0).getReg(0);
+ B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);
+
+ SmallVector<Register, 4> LoadParts(NumLoads);
+
+ MachineBasicBlock::iterator MII = MI.getIterator();
+ MachineInstrSpan Span(MII, &B.getMBB());
+
+ for (int i = 0; i < NumLoads; ++i) {
+ if (NumLoads == 1) {
+ LoadParts[i] = Dst;
+ } else {
+ LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
+ MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);
+ }
+
+ MachineMemOperand *MMO = BaseMMO;
+ if (i != 0)
+ BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);
+
+ B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
+ .addDef(LoadParts[i]) // vdata
+ .addUse(RSrc) // rsrc
+ .addUse(VIndex) // vindex
+ .addUse(VOffset) // voffset
+ .addUse(SOffset) // soffset
+ .addImm(ImmOffset + 16 * i) // offset(imm)
+ .addImm(0) // cachepolicy, swizzled buffer(imm)
+ .addImm(0) // idxen(imm)
+ .addMemOperand(MMO);
+ }
+
+ // TODO: If only the resource is a VGPR, it may be better to execute the
+ // scalar load in the waterfall loop if the resource is expected to frequently
+ // be dynamically uniform.
+ if (RSrcBank != &AMDGPU::SGPRRegBank) {
+ // Remove the original instruction to avoid potentially confusing the
+ // waterfall loop logic.
+ B.setInstr(*Span.begin());
+ MI.eraseFromParent();
+
+ SmallSet<Register, 4> OpsToWaterfall;
+
+ OpsToWaterfall.insert(RSrc);
+ executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
+ OpsToWaterfall, MRI);
+ }
+
+ if (NumLoads != 1) {
+ if (Ty.isVector())
+ B.buildConcatVectors(Dst, LoadParts);
+ else
+ B.buildMerge(Dst, LoadParts);
+ }
+
+ // We removed the instruction earlier with a waterfall loop.
+ if (RSrcBank == &AMDGPU::SGPRRegBank)
+ MI.eraseFromParent();
+
+ return true;
+}
+
+bool AMDGPURegisterBankInfo::applyMappingBFEIntrinsic(
+ const OperandsMapper &OpdMapper, bool Signed) const {
+ MachineInstr &MI = OpdMapper.getMI();
+ MachineRegisterInfo &MRI = OpdMapper.getMRI();
+
+ // Insert basic copies
+ applyDefaultMapping(OpdMapper);
+
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT Ty = MRI.getType(DstReg);
+
+ const LLT S32 = LLT::scalar(32);
+
+ const RegisterBank *DstBank =
+ OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
+ if (DstBank == &AMDGPU::VGPRRegBank) {
+ if (Ty == S32)
+ return true;
+
+ // TODO: 64-bit version is scalar only, so we need to expand this.
+ return false;
+ }
+
+ Register SrcReg = MI.getOperand(2).getReg();
+ Register OffsetReg = MI.getOperand(3).getReg();
+ Register WidthReg = MI.getOperand(4).getReg();
+
+ // The scalar form packs the offset and width in a single operand.
+
+ ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
+ GISelObserverWrapper Observer(&ApplyBank);
+ MachineIRBuilder B(MI);
+ B.setChangeObserver(Observer);
+
+ // Ensure the high bits are clear to insert the offset.
+ auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
+ auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
+
+ // Zeros out the low bits, so don't bother clamping the input value.
+ auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));
+
+ // Transformation function, pack the offset and width of a BFE into
+ // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
+ // source, bits [5:0] contain the offset and bits [22:16] the width.
+ auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);
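+  // e.g. offset = 8 and width = 16 pack to (16 << 16) | 8 = 0x100008.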
+
+ // TODO: It might be worth using a pseudo here to avoid scc clobber and
+ // register class constraints.
+ unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
+ (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
+
+ auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
+ if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
+ llvm_unreachable("failed to constrain BFE");
+
+ MI.eraseFromParent();
+ return true;
+}
+
// FIXME: Duplicated from LegalizerHelper
static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
switch (Opc) {
@@ -1226,6 +1580,51 @@ static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
}
}
+static unsigned minMaxToExtend(unsigned Opc) {
+ switch (Opc) {
+ case TargetOpcode::G_SMIN:
+ case TargetOpcode::G_SMAX:
+ return TargetOpcode::G_SEXT;
+ case TargetOpcode::G_UMIN:
+ case TargetOpcode::G_UMAX:
+ return TargetOpcode::G_ZEXT;
+ default:
+ llvm_unreachable("not in integer min/max");
+ }
+}
+
+// Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding
+// any illegal vector extend or unmerge operations.
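+// The low vector element ends up in bits [15:0] of the bitcast and the high
+// element in bits [31:16].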
+static std::pair<Register, Register>
+unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) {
+ const LLT S32 = LLT::scalar(32);
+ auto Bitcast = B.buildBitcast(S32, Src);
+
+ if (ExtOpcode == TargetOpcode::G_SEXT) {
+ auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16);
+ auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16));
+ return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
+ }
+
+ auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16));
+ if (ExtOpcode == TargetOpcode::G_ZEXT) {
+ auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff));
+ return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
+ }
+
+ assert(ExtOpcode == TargetOpcode::G_ANYEXT);
+ return std::make_pair(Bitcast.getReg(0), ShiftHi.getReg(0));
+}
+
+static MachineInstr *buildExpandedScalarMinMax(MachineIRBuilder &B,
+ CmpInst::Predicate Pred,
+ Register Dst, Register Src0,
+ Register Src1) {
+ const LLT CmpType = LLT::scalar(32);
+ auto Cmp = B.buildICmp(Pred, CmpType, Src0, Src1);
+ return B.buildSelect(Dst, Cmp, Src0, Src1);
+}
+
// FIXME: Duplicated from LegalizerHelper, except changing the boolean type.
void AMDGPURegisterBankInfo::lowerScalarMinMax(MachineIRBuilder &B,
MachineInstr &MI) const {
@@ -1234,24 +1633,25 @@ void AMDGPURegisterBankInfo::lowerScalarMinMax(MachineIRBuilder &B,
Register Src1 = MI.getOperand(2).getReg();
const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
- LLT CmpType = LLT::scalar(32);
-
- auto Cmp = B.buildICmp(Pred, CmpType, Src0, Src1);
- B.buildSelect(Dst, Cmp, Src0, Src1);
+ MachineInstr *Sel = buildExpandedScalarMinMax(B, Pred, Dst, Src0, Src1);
- B.getMRI()->setRegBank(Cmp.getReg(0), AMDGPU::SGPRRegBank);
+ Register CmpReg = Sel->getOperand(1).getReg();
+ B.getMRI()->setRegBank(CmpReg, AMDGPU::SGPRRegBank);
MI.eraseFromParent();
}
// For cases where only a single copy is inserted for matching register banks.
// Replace the register in the instruction operand
-static void substituteSimpleCopyRegs(
+static bool substituteSimpleCopyRegs(
const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
if (!SrcReg.empty()) {
assert(SrcReg.size() == 1);
OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
+ return true;
}
+
+ return false;
}
/// Handle register layout difference for f16 images for some subtargets.
@@ -1465,6 +1865,223 @@ bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
}
+/// Utility function for pushing dynamic vector indexes with a constant offset
+/// into waterfall loops.
+static void reinsertVectorIndexAdd(MachineIRBuilder &B,
+ MachineInstr &IdxUseInstr,
+ unsigned OpIdx,
+ unsigned ConstOffset) {
+ MachineRegisterInfo &MRI = *B.getMRI();
+ const LLT S32 = LLT::scalar(32);
+ Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();
+ B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());
+
+ auto MaterializedOffset = B.buildConstant(S32, ConstOffset);
+
+ auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);
+ MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
+ MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
+ IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));
+}
+
+/// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the
+/// original 32-bit source value (to be inserted in the low part of the combined
+/// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit
+/// value.
+static void extendLow32IntoHigh32(MachineIRBuilder &B,
+ Register Hi32Reg, Register Lo32Reg,
+ unsigned ExtOpc,
+ const RegisterBank &RegBank,
+ bool IsBooleanSrc = false) {
+ if (ExtOpc == AMDGPU::G_ZEXT) {
+ B.buildConstant(Hi32Reg, 0);
+ } else if (ExtOpc == AMDGPU::G_SEXT) {
+ if (IsBooleanSrc) {
+ // If we know the original source was an s1, the high half is the same as
+ // the low.
+ B.buildCopy(Hi32Reg, Lo32Reg);
+ } else {
+ // Replicate sign bit from 32-bit extended part.
+ auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31);
+ B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank);
+ B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt);
+ }
+ } else {
+ assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension");
+ B.buildUndef(Hi32Reg);
+ }
+}
+
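+// Lower a dynamically indexed extractelement into a chain of compares against
+// each constant index followed by selects, when the target considers that
+// profitable.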
+bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
+ MachineInstr &MI, MachineRegisterInfo &MRI,
+ const OperandsMapper &OpdMapper) const {
+
+ Register VecReg = MI.getOperand(1).getReg();
+ Register Idx = MI.getOperand(2).getReg();
+
+ const RegisterBank &IdxBank =
+ *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
+
+ bool IsDivergentIdx = IdxBank == AMDGPU::VGPRRegBank;
+
+ LLT VecTy = MRI.getType(VecReg);
+ unsigned EltSize = VecTy.getScalarSizeInBits();
+ unsigned NumElem = VecTy.getNumElements();
+
+ if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
+ IsDivergentIdx))
+ return false;
+
+ MachineIRBuilder B(MI);
+ LLT S32 = LLT::scalar(32);
+
+ const RegisterBank &DstBank =
+ *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
+ const RegisterBank &SrcBank =
+ *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
+
+ const RegisterBank &CCBank =
+ (DstBank == AMDGPU::SGPRRegBank &&
+ SrcBank == AMDGPU::SGPRRegBank &&
+ IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
+ : AMDGPU::VCCRegBank;
+ LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
+
+ if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
+ Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
+ MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
+ }
+
+ LLT EltTy = VecTy.getScalarType();
+ SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
+ unsigned NumLanes = DstRegs.size();
+ if (!NumLanes)
+ NumLanes = 1;
+ else
+ EltTy = MRI.getType(DstRegs[0]);
+
+ auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
+ SmallVector<Register, 2> Res(NumLanes);
+ for (unsigned L = 0; L < NumLanes; ++L)
+ Res[L] = UnmergeToEltTy.getReg(L);
+
+ for (unsigned I = 1; I < NumElem; ++I) {
+ auto IC = B.buildConstant(S32, I);
+ MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
+ auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
+ MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
+
+ for (unsigned L = 0; L < NumLanes; ++L) {
+ auto S = B.buildSelect(EltTy, Cmp,
+ UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]);
+
+ for (unsigned N : { 0, 2, 3 })
+ MRI.setRegBank(S->getOperand(N).getReg(), DstBank);
+
+ Res[L] = S->getOperand(0).getReg();
+ }
+ }
+
+ for (unsigned L = 0; L < NumLanes; ++L) {
+ Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L];
+ B.buildCopy(DstReg, Res[L]);
+ MRI.setRegBank(DstReg, DstBank);
+ }
+
+ MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
+ MI.eraseFromParent();
+
+ return true;
+}
+
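+// Lower a dynamically indexed insertelement the same way: for each constant
+// index, select between the original element and the value to insert, then
+// rebuild the vector.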
+bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
+ MachineInstr &MI, MachineRegisterInfo &MRI,
+ const OperandsMapper &OpdMapper) const {
+
+ Register VecReg = MI.getOperand(1).getReg();
+ Register Idx = MI.getOperand(3).getReg();
+
+ const RegisterBank &IdxBank =
+ *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
+
+ bool IsDivergentIdx = IdxBank == AMDGPU::VGPRRegBank;
+
+ LLT VecTy = MRI.getType(VecReg);
+ unsigned EltSize = VecTy.getScalarSizeInBits();
+ unsigned NumElem = VecTy.getNumElements();
+
+ if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
+ IsDivergentIdx))
+ return false;
+
+ MachineIRBuilder B(MI);
+ LLT S32 = LLT::scalar(32);
+
+ const RegisterBank &DstBank =
+ *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
+ const RegisterBank &SrcBank =
+ *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
+ const RegisterBank &InsBank =
+ *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
+
+ const RegisterBank &CCBank =
+ (DstBank == AMDGPU::SGPRRegBank &&
+ SrcBank == AMDGPU::SGPRRegBank &&
+ InsBank == AMDGPU::SGPRRegBank &&
+ IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
+ : AMDGPU::VCCRegBank;
+ LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
+
+ if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
+ Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
+ MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
+ }
+
+ LLT EltTy = VecTy.getScalarType();
+ SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
+ unsigned NumLanes = InsRegs.size();
+ if (!NumLanes) {
+ NumLanes = 1;
+ InsRegs.push_back(MI.getOperand(2).getReg());
+ } else {
+ EltTy = MRI.getType(InsRegs[0]);
+ }
+
+ auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
+ SmallVector<Register, 16> Ops(NumElem * NumLanes);
+
+ for (unsigned I = 0; I < NumElem; ++I) {
+ auto IC = B.buildConstant(S32, I);
+ MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
+ auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
+ MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
+
+ for (unsigned L = 0; L < NumLanes; ++L) {
+ auto S = B.buildSelect(EltTy, Cmp, InsRegs[L],
+ UnmergeToEltTy.getReg(I * NumLanes + L));
+
+ for (unsigned N : { 0, 2, 3 })
+ MRI.setRegBank(S->getOperand(N).getReg(), DstBank);
+
+ Ops[I * NumLanes + L] = S->getOperand(0).getReg();
+ }
+ }
+
+ LLT MergeTy = LLT::vector(Ops.size(), EltTy);
+ if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) {
+ B.buildBuildVector(MI.getOperand(0), Ops);
+ } else {
+ auto Vec = B.buildBuildVector(MergeTy, Ops);
+ MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank);
+ B.buildBitcast(MI.getOperand(0).getReg(), Vec);
+ }
+
+ MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
+ MI.eraseFromParent();
+
+ return true;
+}
+
void AMDGPURegisterBankInfo::applyMappingImpl(
const OperandsMapper &OpdMapper) const {
MachineInstr &MI = OpdMapper.getMI();
@@ -1555,7 +2172,13 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
MachineBasicBlock *MBB = MI.getParent();
B.setInsertPt(*MBB, std::next(MI.getIterator()));
- B.buildTrunc(DstReg, NewDstReg);
+
+ // If we had a constrained VCC result register, a copy was inserted to VCC
+ // from SGPR.
+ SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
+ if (DefRegs.empty())
+ DefRegs.push_back(DstReg);
+ B.buildTrunc(DefRegs[0], NewDstReg);
return;
}
case AMDGPU::G_SELECT: {
@@ -1712,10 +2335,16 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
}
case AMDGPU::G_ADD:
case AMDGPU::G_SUB:
- case AMDGPU::G_MUL: {
+ case AMDGPU::G_MUL:
+ case AMDGPU::G_SHL:
+ case AMDGPU::G_LSHR:
+ case AMDGPU::G_ASHR: {
Register DstReg = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(DstReg);
- if (DstTy != LLT::scalar(16))
+
+ // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
+ // Packed 16-bit operations need to be scalarized and promoted.
+ if (DstTy != LLT::scalar(16) && DstTy != LLT::vector(2, 16))
break;
const RegisterBank *DstBank =
@@ -1723,16 +2352,42 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
if (DstBank == &AMDGPU::VGPRRegBank)
break;
- // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
- MachineFunction *MF = MI.getParent()->getParent();
+ const LLT S32 = LLT::scalar(32);
+ MachineBasicBlock *MBB = MI.getParent();
+ MachineFunction *MF = MBB->getParent();
MachineIRBuilder B(MI);
ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
GISelObserverWrapper Observer(&ApplySALU);
- LegalizerHelper Helper(*MF, Observer, B);
- if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
- LegalizerHelper::Legalized)
- llvm_unreachable("widen scalar should have succeeded");
+ if (DstTy.isVector()) {
+ B.setChangeObserver(Observer);
+
+ Register WideSrc0Lo, WideSrc0Hi;
+ Register WideSrc1Lo, WideSrc1Hi;
+
+ std::tie(WideSrc0Lo, WideSrc0Hi)
+ = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), AMDGPU::G_ANYEXT);
+ std::tie(WideSrc1Lo, WideSrc1Hi)
+ = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), AMDGPU::G_ANYEXT);
+ auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo});
+ auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi});
+ B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
+ MI.eraseFromParent();
+ } else {
+ LegalizerHelper Helper(*MF, Observer, B);
+
+ if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
+ llvm_unreachable("widen scalar should have succeeded");
+
+ // FIXME: s16 shift amounts should be legal.
+ if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR ||
+ Opc == AMDGPU::G_ASHR) {
+ B.setInsertPt(*MBB, MI.getIterator());
+ if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized)
+ llvm_unreachable("widen scalar should have succeeded");
+ }
+ }
+
return;
}
case AMDGPU::G_SMIN:
@@ -1750,10 +2405,44 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
// Turn scalar min/max into a compare and select.
LLT Ty = MRI.getType(DstReg);
- LLT S32 = LLT::scalar(32);
- LLT S16 = LLT::scalar(16);
+ const LLT S32 = LLT::scalar(32);
+ const LLT S16 = LLT::scalar(16);
+ const LLT V2S16 = LLT::vector(2, 16);
- if (Ty == S16) {
+ if (Ty == V2S16) {
+ ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
+ GISelObserverWrapper Observer(&ApplySALU);
+ B.setChangeObserver(Observer);
+
+ // Need to widen to s32, and expand as cmp + select, and avoid producing
+ // illegal vector extends or unmerges that would need further
+ // legalization.
+ //
+ // TODO: Should we just readfirstlane? That should probably be handled
+ // with a UniformVGPR register bank that wouldn't need special
+ // consideration here.
+
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src0 = MI.getOperand(1).getReg();
+ Register Src1 = MI.getOperand(2).getReg();
+
+ Register WideSrc0Lo, WideSrc0Hi;
+ Register WideSrc1Lo, WideSrc1Hi;
+
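+      // Sign-extend for signed min/max and zero-extend for unsigned so the
+      // 32-bit compares give the same results as the 16-bit ones would.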
+ unsigned ExtendOp = minMaxToExtend(MI.getOpcode());
+
+ std::tie(WideSrc0Lo, WideSrc0Hi) = unpackV2S16ToS32(B, Src0, ExtendOp);
+ std::tie(WideSrc1Lo, WideSrc1Hi) = unpackV2S16ToS32(B, Src1, ExtendOp);
+
+ Register Lo = MRI.createGenericVirtualRegister(S32);
+ Register Hi = MRI.createGenericVirtualRegister(S32);
+ const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
+ buildExpandedScalarMinMax(B, Pred, Lo, WideSrc0Lo, WideSrc1Lo);
+ buildExpandedScalarMinMax(B, Pred, Hi, WideSrc0Hi, WideSrc1Hi);
+
+ B.buildBuildVectorTrunc(Dst, {Lo, Hi});
+ MI.eraseFromParent();
+ } else if (Ty == S16) {
ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
GISelObserverWrapper Observer(&ApplySALU);
LegalizerHelper Helper(*MF, Observer, B);
@@ -1769,11 +2458,77 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
return;
}
+ case AMDGPU::G_SEXT_INREG: {
+ SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
+ if (SrcRegs.empty())
+ break; // Nothing to repair
+
+ const LLT S32 = LLT::scalar(32);
+ MachineIRBuilder B(MI);
+ ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank);
+ GISelObserverWrapper Observer(&O);
+ B.setChangeObserver(Observer);
+
+ // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
+ // we would need to further expand, and doesn't let us directly set the
+ // result registers.
+ SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
+
+ int Amt = MI.getOperand(2).getImm();
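+    // e.g. a 64-bit G_SEXT_INREG from 8 bits becomes a 32-bit sign-extend of
+    // the low half plus an arithmetic shift right by 31 for the high half.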
+ if (Amt <= 32) {
+ if (Amt == 32) {
+ // The low bits are unchanged.
+ B.buildCopy(DstRegs[0], SrcRegs[0]);
+ } else {
+ // Extend in the low bits and propagate the sign bit to the high half.
+ B.buildSExtInReg(DstRegs[0], SrcRegs[0], Amt);
+ }
+
+ B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31));
+ } else {
+ // The low bits are unchanged, and extend in the high bits.
+ B.buildCopy(DstRegs[0], SrcRegs[0]);
+ B.buildSExtInReg(DstRegs[1], DstRegs[0], Amt - 32);
+ }
+
+ Register DstReg = MI.getOperand(0).getReg();
+ MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
+ MI.eraseFromParent();
+ return;
+ }
+ case AMDGPU::G_CTPOP:
+ case AMDGPU::G_CTLZ_ZERO_UNDEF:
+ case AMDGPU::G_CTTZ_ZERO_UNDEF: {
+ MachineIRBuilder B(MI);
+ MachineFunction &MF = B.getMF();
+
+ const RegisterBank *DstBank =
+ OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
+ if (DstBank == &AMDGPU::SGPRRegBank)
+ break;
+
+ Register SrcReg = MI.getOperand(1).getReg();
+ const LLT S32 = LLT::scalar(32);
+ LLT Ty = MRI.getType(SrcReg);
+ if (Ty == S32)
+ break;
+
+ ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
+ GISelObserverWrapper Observer(&ApplyVALU);
+ LegalizerHelper Helper(MF, Observer, B);
+
+ if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
+ llvm_unreachable("narrowScalar should have succeeded");
+ return;
+ }
case AMDGPU::G_SEXT:
- case AMDGPU::G_ZEXT: {
+ case AMDGPU::G_ZEXT:
+ case AMDGPU::G_ANYEXT: {
Register SrcReg = MI.getOperand(1).getReg();
LLT SrcTy = MRI.getType(SrcReg);
- bool Signed = Opc == AMDGPU::G_SEXT;
+ const bool Signed = Opc == AMDGPU::G_SEXT;
+
+ assert(empty(OpdMapper.getVRegs(1)));
MachineIRBuilder B(MI);
const RegisterBank *SrcBank =
@@ -1788,23 +2543,19 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
// breakdowns supported.
DstTy.getSizeInBits() == 64 &&
SrcTy.getSizeInBits() <= 32) {
- const LLT S32 = LLT::scalar(32);
SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
// Extend to 32-bit, and then extend the low half.
if (Signed) {
// TODO: Should really be buildSExtOrCopy
B.buildSExtOrTrunc(DefRegs[0], SrcReg);
-
- // Replicate sign bit from 32-bit extended part.
- auto ShiftAmt = B.buildConstant(S32, 31);
- MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank);
- B.buildAShr(DefRegs[1], DefRegs[0], ShiftAmt);
- } else {
+ } else if (Opc == AMDGPU::G_ZEXT) {
B.buildZExtOrTrunc(DefRegs[0], SrcReg);
- B.buildConstant(DefRegs[1], 0);
+ } else {
+ B.buildAnyExtOrTrunc(DefRegs[0], SrcReg);
}
+ extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank);
MRI.setRegBank(DstReg, *SrcBank);
MI.eraseFromParent();
return;
@@ -1813,6 +2564,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
if (SrcTy != LLT::scalar(1))
return;
+    // It is not legal to have a legalization artifact with a VCC source. Rather
+    // than introducing a copy, insert the select that such a copy would have
+    // been selected to.
if (SrcBank == &AMDGPU::VCCRegBank) {
SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
@@ -1834,7 +2588,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
if (DstSize > 32) {
B.buildSelect(DefRegs[0], SrcReg, True, False);
- B.buildCopy(DefRegs[1], DefRegs[0]);
+ extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true);
} else if (DstSize < 32) {
auto Sel = B.buildSelect(SelType, SrcReg, True, False);
MRI.setRegBank(Sel.getReg(0), *DstBank);
@@ -1847,24 +2601,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
return;
}
- // Fixup the case with an s1 src that isn't a condition register. Use shifts
- // instead of introducing a compare to avoid an unnecessary condition
- // register (and since there's no scalar 16-bit compares).
- auto Ext = B.buildAnyExt(DstTy, SrcReg);
- auto ShiftAmt = B.buildConstant(LLT::scalar(32), DstTy.getSizeInBits() - 1);
- auto Shl = B.buildShl(DstTy, Ext, ShiftAmt);
-
- if (MI.getOpcode() == AMDGPU::G_SEXT)
- B.buildAShr(DstReg, Shl, ShiftAmt);
- else
- B.buildLShr(DstReg, Shl, ShiftAmt);
-
- MRI.setRegBank(DstReg, *SrcBank);
- MRI.setRegBank(Ext.getReg(0), *SrcBank);
- MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank);
- MRI.setRegBank(Shl.getReg(0), *SrcBank);
- MI.eraseFromParent();
- return;
+ break;
}
case AMDGPU::G_BUILD_VECTOR:
case AMDGPU::G_BUILD_VECTOR_TRUNC: {
@@ -1934,7 +2671,16 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());
- LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+
+ const LLT S32 = LLT::scalar(32);
+ LLT DstTy = MRI.getType(DstReg);
+ LLT SrcTy = MRI.getType(SrcReg);
+
+ if (foldExtractEltToCmpSelect(MI, MRI, OpdMapper))
+ return;
+
MachineIRBuilder B(MI);
const ValueMapping &DstMapping
@@ -1942,10 +2688,26 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
const RegisterBank *SrcBank =
OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
-
- Register DstReg = MI.getOperand(0).getReg();
- Register SrcReg = MI.getOperand(1).getReg();
- Register IdxReg = MI.getOperand(2).getReg();
+ const RegisterBank *IdxBank =
+ OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
+
+ Register BaseIdxReg;
+ unsigned ConstOffset;
+ MachineInstr *OffsetDef;
+ std::tie(BaseIdxReg, ConstOffset, OffsetDef) =
+ AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg());
+
+ // See if the index is an add of a constant which will be foldable by moving
+ // the base register of the index later if this is going to be executed in a
+ // waterfall loop. This is essentially to reassociate the add of a constant
+ // with the readfirstlane.
+ bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
+ ConstOffset > 0 &&
+ ConstOffset < SrcTy.getNumElements();
+
+ // Move the base register. We'll re-insert the add later.
+ if (ShouldMoveIndexIntoLoop)
+ MI.getOperand(2).setReg(BaseIdxReg);
// If this is a VGPR result only because the index was a VGPR result, the
// actual indexing will be done on the SGPR source vector, which will
@@ -1969,26 +2731,30 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
buildVCopy(B, DstReg, TmpReg);
}
+ // Re-insert the constant offset add inside the waterfall loop.
+ if (ShouldMoveIndexIntoLoop)
+ reinsertVectorIndexAdd(B, MI, 2, ConstOffset);
+
return;
}
assert(DstTy.getSizeInBits() == 64);
- LLT SrcTy = MRI.getType(SrcReg);
- const LLT S32 = LLT::scalar(32);
LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32);
auto CastSrc = B.buildBitcast(Vec32, SrcReg);
auto One = B.buildConstant(S32, 1);
+ MachineBasicBlock::iterator MII = MI.getIterator();
+
// Split the vector index into 32-bit pieces. Prepare to move all of the
// new instructions into a waterfall loop if necessary.
//
// Don't put the bitcast or constant in the loop.
- MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
+ MachineInstrSpan Span(MII, &B.getMBB());
// Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
- auto IdxLo = B.buildShl(S32, IdxReg, One);
+ auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
auto IdxHi = B.buildAdd(S32, IdxLo, One);
auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
@@ -2029,33 +2795,70 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
buildVCopy(B, DstRegs[1], TmpReg1);
}
+ if (ShouldMoveIndexIntoLoop)
+ reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
+
return;
}
case AMDGPU::G_INSERT_VECTOR_ELT: {
SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT VecTy = MRI.getType(DstReg);
+
assert(OpdMapper.getVRegs(0).empty());
- assert(OpdMapper.getVRegs(1).empty());
assert(OpdMapper.getVRegs(3).empty());
- if (InsRegs.empty()) {
- applyDefaultMapping(OpdMapper);
- executeInWaterfallLoop(MI, MRI, { 3 });
+ if (substituteSimpleCopyRegs(OpdMapper, 1))
+ MRI.setType(MI.getOperand(1).getReg(), VecTy);
+
+ if (foldInsertEltToCmpSelect(MI, MRI, OpdMapper))
return;
- }
- Register DstReg = MI.getOperand(0).getReg();
+ const RegisterBank *IdxBank =
+ OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
+
Register SrcReg = MI.getOperand(1).getReg();
Register InsReg = MI.getOperand(2).getReg();
- Register IdxReg = MI.getOperand(3).getReg();
- LLT SrcTy = MRI.getType(SrcReg);
LLT InsTy = MRI.getType(InsReg);
(void)InsTy;
+ Register BaseIdxReg;
+ unsigned ConstOffset;
+ MachineInstr *OffsetDef;
+ std::tie(BaseIdxReg, ConstOffset, OffsetDef) =
+ AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg());
+
+ // See if the index is an add of a constant which will be foldable by moving
+ // the base register of the index later if this is going to be executed in a
+ // waterfall loop. This is essentially to reassociate the add of a constant
+ // with the readfirstlane.
+ bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
+ ConstOffset > 0 &&
+ ConstOffset < VecTy.getNumElements();
+
+ // Move the base register. We'll re-insert the add later.
+ if (ShouldMoveIndexIntoLoop)
+ MI.getOperand(3).setReg(BaseIdxReg);
+
+ if (InsRegs.empty()) {
+ executeInWaterfallLoop(MI, MRI, { 3 });
+
+ // Re-insert the constant offset add inside the waterfall loop.
+ if (ShouldMoveIndexIntoLoop) {
+ MachineIRBuilder B(MI);
+ reinsertVectorIndexAdd(B, MI, 3, ConstOffset);
+ }
+
+ return;
+ }
+
assert(InsTy.getSizeInBits() == 64);
const LLT S32 = LLT::scalar(32);
- LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32);
+ LLT Vec32 = LLT::vector(2 * VecTy.getNumElements(), 32);
MachineIRBuilder B(MI);
auto CastSrc = B.buildBitcast(Vec32, SrcReg);
@@ -2068,12 +2871,11 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
// Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
- auto IdxLo = B.buildShl(S32, IdxReg, One);
+ auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
auto IdxHi = B.buildAdd(S32, IdxLo, One);
auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);
- B.buildBitcast(DstReg, InsHi);
const RegisterBank *DstBank =
OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
@@ -2093,6 +2895,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
SmallSet<Register, 4> OpsToWaterfall;
if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
+ B.setInsertPt(B.getMBB(), MI);
+ B.buildBitcast(DstReg, InsHi);
MI.eraseFromParent();
return;
}
@@ -2100,17 +2904,70 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
B.setInstr(*Span.begin());
MI.eraseFromParent();
+ // Figure out the point after the waterfall loop before mangling the control
+ // flow.
executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
OpsToWaterfall, MRI);
+
+ // The insertion point is now right after the original instruction.
+ //
+    // Keep the bitcast to the original vector type out of the loop. Doing this
+    // saves an extra phi we don't need inside the loop.
+ B.buildBitcast(DstReg, InsHi);
+
+ // Re-insert the constant offset add inside the waterfall loop.
+ if (ShouldMoveIndexIntoLoop)
+ reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
+
+ return;
+ }
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD:
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
+ case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
+ case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
+ case AMDGPU::G_AMDGPU_BUFFER_STORE:
+ case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
+ case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
+ case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
+ case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16:
+ case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
+ case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: {
+ applyDefaultMapping(OpdMapper);
+ executeInWaterfallLoop(MI, MRI, {1, 4});
+ return;
+ }
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
+ applyDefaultMapping(OpdMapper);
+ executeInWaterfallLoop(MI, MRI, {2, 5});
+ return;
+ }
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
+ applyDefaultMapping(OpdMapper);
+ executeInWaterfallLoop(MI, MRI, {3, 6});
+ return;
+ }
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
+ applyMappingSBufferLoad(OpdMapper);
return;
}
case AMDGPU::G_INTRINSIC: {
switch (MI.getIntrinsicID()) {
- case Intrinsic::amdgcn_s_buffer_load: {
- // FIXME: Move to G_INTRINSIC_W_SIDE_EFFECTS
- executeInWaterfallLoop(MI, MRI, { 2, 3 });
- return;
- }
case Intrinsic::amdgcn_readlane: {
substituteSimpleCopyRegs(OpdMapper, 2);
@@ -2132,18 +2989,51 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
constrainOpWithReadfirstlane(MI, MRI, 3); // Index
return;
}
- default:
- break;
+ case Intrinsic::amdgcn_ballot:
+ case Intrinsic::amdgcn_interp_p1:
+ case Intrinsic::amdgcn_interp_p2:
+ case Intrinsic::amdgcn_interp_mov:
+ case Intrinsic::amdgcn_interp_p1_f16:
+ case Intrinsic::amdgcn_interp_p2_f16: {
+ applyDefaultMapping(OpdMapper);
+
+ // Readlane for m0 value, which is always the last operand.
+ // FIXME: Should this be a waterfall loop instead?
+ constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
+ return;
+ }
+ case Intrinsic::amdgcn_permlane16:
+ case Intrinsic::amdgcn_permlanex16: {
+ // Doing a waterfall loop over these wouldn't make any sense.
+ substituteSimpleCopyRegs(OpdMapper, 2);
+ substituteSimpleCopyRegs(OpdMapper, 3);
+ constrainOpWithReadfirstlane(MI, MRI, 4);
+ constrainOpWithReadfirstlane(MI, MRI, 5);
+ return;
+ }
+ case Intrinsic::amdgcn_sbfe:
+ applyMappingBFEIntrinsic(OpdMapper, true);
+ return;
+ case Intrinsic::amdgcn_ubfe:
+ applyMappingBFEIntrinsic(OpdMapper, false);
+ return;
}
break;
}
+ case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
+ case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
+ const AMDGPU::RsrcIntrinsic *RSrcIntrin
+ = AMDGPU::lookupRsrcIntrinsic(MI.getIntrinsicID());
+ assert(RSrcIntrin && RSrcIntrin->IsImage);
+ // Non-images can have complications from operands that allow both SGPR
+ // and VGPR. For now it's too complicated to figure out the final opcode
+ // to derive the register bank from the MCInstrDesc.
+ applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
+ return;
+ }
case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
auto IntrID = MI.getIntrinsicID();
switch (IntrID) {
- case Intrinsic::amdgcn_buffer_load: {
- executeInWaterfallLoop(MI, MRI, { 2 });
- return;
- }
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap: {
// This is only allowed to execute with 1 lane, so readfirstlane is safe.
@@ -2167,28 +3057,19 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
constrainOpWithReadfirstlane(MI, MRI, 1); // M0
return;
}
+ case Intrinsic::amdgcn_ds_append:
+ case Intrinsic::amdgcn_ds_consume: {
+ constrainOpWithReadfirstlane(MI, MRI, 2); // M0
+ return;
+ }
case Intrinsic::amdgcn_s_sendmsg:
case Intrinsic::amdgcn_s_sendmsghalt: {
// FIXME: Should this use a waterfall loop?
constrainOpWithReadfirstlane(MI, MRI, 2); // M0
return;
}
- case Intrinsic::amdgcn_raw_buffer_load:
- case Intrinsic::amdgcn_raw_buffer_load_format:
- case Intrinsic::amdgcn_raw_tbuffer_load:
- case Intrinsic::amdgcn_raw_buffer_store:
- case Intrinsic::amdgcn_raw_buffer_store_format:
- case Intrinsic::amdgcn_raw_tbuffer_store: {
- applyDefaultMapping(OpdMapper);
- executeInWaterfallLoop(MI, MRI, {2, 4});
- return;
- }
- case Intrinsic::amdgcn_struct_buffer_load:
- case Intrinsic::amdgcn_struct_buffer_store:
- case Intrinsic::amdgcn_struct_tbuffer_load:
- case Intrinsic::amdgcn_struct_tbuffer_store: {
- applyDefaultMapping(OpdMapper);
- executeInWaterfallLoop(MI, MRI, {2, 5});
+ case Intrinsic::amdgcn_s_setreg: {
+ constrainOpWithReadfirstlane(MI, MRI, 2);
return;
}
default: {
@@ -2211,10 +3092,13 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
case AMDGPU::G_LOAD:
case AMDGPU::G_ZEXTLOAD:
case AMDGPU::G_SEXTLOAD: {
- if (applyMappingWideLoad(MI, OpdMapper, MRI))
+ if (applyMappingLoad(MI, OpdMapper, MRI))
return;
break;
}
+ case AMDGPU::G_DYN_STACKALLOC:
+ applyMappingDynStackAlloc(MI, OpdMapper, MRI);
+ return;
default:
break;
}
@@ -2244,7 +3128,11 @@ AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
- unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
+ const MachineOperand &SrcOp = MI.getOperand(i);
+ if (!SrcOp.isReg())
+ continue;
+
+ unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI);
OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
}
return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
@@ -2256,31 +3144,19 @@ AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
const MachineFunction &MF = *MI.getParent()->getParent();
const MachineRegisterInfo &MRI = MF.getRegInfo();
SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
- unsigned OpdIdx = 0;
-
- unsigned Size0 = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
- OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0);
-
- if (MI.getOperand(OpdIdx).isIntrinsicID())
- OpdsMapping[OpdIdx++] = nullptr;
- Register Reg1 = MI.getOperand(OpdIdx).getReg();
- unsigned Size1 = getSizeInBits(Reg1, MRI, *TRI);
-
- unsigned DefaultBankID = Size1 == 1 ?
- AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
- unsigned Bank1 = getRegBankID(Reg1, MRI, *TRI, DefaultBankID);
-
- OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(Bank1, Size1);
-
- for (unsigned e = MI.getNumOperands(); OpdIdx != e; ++OpdIdx) {
- const MachineOperand &MO = MI.getOperand(OpdIdx);
- if (!MO.isReg())
+ // Even though we technically could use SGPRs, this would require knowledge of
+ // the constant bus restriction. Force all sources to VGPR (except for VCC).
+ //
+ // TODO: Unary ops are trivially OK, so accept SGPRs?
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ const MachineOperand &Src = MI.getOperand(i);
+ if (!Src.isReg())
continue;
- unsigned Size = getSizeInBits(MO.getReg(), MRI, *TRI);
+ unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI);
unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
- OpdsMapping[OpdIdx] = AMDGPU::getValueMapping(BankID, Size);
+ OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
}
return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
@@ -2324,6 +3200,10 @@ AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
continue;
Register OpReg = MI.getOperand(I).getReg();
+ // We replace some dead address operands with $noreg
+ if (!OpReg)
+ continue;
+
unsigned Size = getSizeInBits(OpReg, MRI, *TRI);
// FIXME: Probably need a new intrinsic register bank searchable table to
@@ -2345,6 +3225,22 @@ AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
}
+/// Return the mapping for a pointer argument.
+const RegisterBankInfo::ValueMapping *
+AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
+ Register PtrReg) const {
+ LLT PtrTy = MRI.getType(PtrReg);
+ unsigned Size = PtrTy.getSizeInBits();
+ if (Subtarget.useFlatForGlobal() ||
+ !SITargetLowering::isFlatGlobalAddrSpace(PtrTy.getAddressSpace()))
+ return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+
+ // If we're using MUBUF instructions for global memory, an SGPR base register
+ // is possible. Otherwise this needs to be a VGPR.
+ const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
+ return AMDGPU::getValueMapping(PtrBank->getID(), Size);
+}
+
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
@@ -2352,7 +3248,6 @@ AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
const MachineRegisterInfo &MRI = MF.getRegInfo();
SmallVector<const ValueMapping*, 2> OpdsMapping(2);
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
- LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
Register PtrReg = MI.getOperand(1).getReg();
LLT PtrTy = MRI.getType(PtrReg);
unsigned AS = PtrTy.getAddressSpace();
@@ -2364,14 +3259,23 @@ AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
if (PtrBank == &AMDGPU::SGPRRegBank &&
- (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
- AS != AMDGPUAS::PRIVATE_ADDRESS) &&
- isScalarLoadLegal(MI)) {
- // We have a uniform instruction so we want to use an SMRD load
- ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
- PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
+ SITargetLowering::isFlatGlobalAddrSpace(AS)) {
+ if (isScalarLoadLegal(MI)) {
+ // We have a uniform instruction so we want to use an SMRD load
+ ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
+ PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
+ } else {
+ ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+
+ // If we're using MUBUF instructions for global memory, an SGPR base
+ // register is possible. Otherwise this needs to be a VGPR.
+ unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
+ AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;
+
+ PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize);
+ }
} else {
- ValMapping = AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy);
+ ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
}
@@ -2449,11 +3353,35 @@ AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
/// in RegBankSelect::Mode::Fast. Any mapping that would cause a
/// VGPR to SGPR generated is illegal.
///
+// Operands that must be SGPRs must accept potentially divergent VGPRs as
+// legal. These will be dealt with in applyMappingImpl.
+//
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
const MachineFunction &MF = *MI.getParent()->getParent();
const MachineRegisterInfo &MRI = MF.getRegInfo();
+ if (MI.isCopy()) {
+ // The default logic bothers to analyze impossible alternative mappings. We
+ // want the most straightforward mapping, so just directly handle this.
+ const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI,
+ *TRI);
+ const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI,
+ *TRI);
+ assert(SrcBank && "src bank should have been assigned already");
+ if (!DstBank)
+ DstBank = SrcBank;
+
+ unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
+ if (cannotCopy(*DstBank, *SrcBank, Size))
+ return getInvalidInstructionMapping();
+
+ const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank);
+ return getInstructionMapping(
+ 1, /*Cost*/ 1,
+ /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
+ }
+
if (MI.isRegSequence()) {
// If any input is a VGPR, the result must be a VGPR. The default handling
// assumes any copy between banks is legal.
@@ -2592,6 +3520,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
LLVM_FALLTHROUGH;
}
case AMDGPU::G_PTR_ADD:
+ case AMDGPU::G_PTRMASK:
case AMDGPU::G_ADD:
case AMDGPU::G_SUB:
case AMDGPU::G_MUL:
@@ -2608,6 +3537,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_SMAX:
case AMDGPU::G_UMIN:
case AMDGPU::G_UMAX:
+ case AMDGPU::G_SHUFFLE_VECTOR:
if (isSALUMapping(MI))
return getDefaultMappingSOP(MI);
LLVM_FALLTHROUGH;
@@ -2635,7 +3565,16 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_FMAXNUM_IEEE:
case AMDGPU::G_FCANONICALIZE:
case AMDGPU::G_INTRINSIC_TRUNC:
+ case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?
+ case AMDGPU::G_FSHR: // TODO: Expand for scalar
case AMDGPU::G_AMDGPU_FFBH_U32:
+ case AMDGPU::G_AMDGPU_FMIN_LEGACY:
+ case AMDGPU::G_AMDGPU_FMAX_LEGACY:
+ case AMDGPU::G_AMDGPU_RCP_IFLAG:
+ case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
+ case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
+ case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
+ case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
return getDefaultMappingVOP(MI);
case AMDGPU::G_UMULH:
case AMDGPU::G_SMULH: {
@@ -2664,6 +3603,13 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
break;
}
+ case AMDGPU::G_DYN_STACKALLOC: {
+ // Result is always uniform, and a wave reduction is needed for the source.
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
+ unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
+ OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32);
+ break;
+ }
case AMDGPU::G_INSERT: {
unsigned BankID = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID :
AMDGPU::VGPRRegBankID;
@@ -2719,12 +3665,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_BITCAST:
case AMDGPU::G_INTTOPTR:
case AMDGPU::G_PTRTOINT:
- case AMDGPU::G_CTLZ:
- case AMDGPU::G_CTLZ_ZERO_UNDEF:
- case AMDGPU::G_CTTZ:
- case AMDGPU::G_CTTZ_ZERO_UNDEF:
- case AMDGPU::G_CTPOP:
- case AMDGPU::G_BSWAP:
case AMDGPU::G_BITREVERSE:
case AMDGPU::G_FABS:
case AMDGPU::G_FNEG: {
@@ -2733,21 +3673,33 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
break;
}
+ case AMDGPU::G_CTLZ_ZERO_UNDEF:
+ case AMDGPU::G_CTTZ_ZERO_UNDEF:
+ case AMDGPU::G_CTPOP: {
+ unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
+ unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
+ OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
+
+ // This should really be getValueMappingSGPR64Only, but allowing the generic
+ // code to handle the register split just makes using LegalizerHelper more
+ // difficult.
+ OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
+ break;
+ }
case AMDGPU::G_TRUNC: {
Register Dst = MI.getOperand(0).getReg();
Register Src = MI.getOperand(1).getReg();
unsigned Bank = getRegBankID(Src, MRI, *TRI);
unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
- OpdsMapping[0] = DstSize == 1 && Bank != AMDGPU::SGPRRegBankID ?
- AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize) :
- AMDGPU::getValueMapping(Bank, DstSize);
+ OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
break;
}
case AMDGPU::G_ZEXT:
case AMDGPU::G_SEXT:
- case AMDGPU::G_ANYEXT: {
+ case AMDGPU::G_ANYEXT:
+ case AMDGPU::G_SEXT_INREG: {
Register Dst = MI.getOperand(0).getReg();
Register Src = MI.getOperand(1).getReg();
unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
@@ -2765,17 +3717,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
break;
}
- // TODO: Should anyext be split into 32-bit part as well?
- if (MI.getOpcode() == AMDGPU::G_ANYEXT) {
- OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, DstSize);
- OpdsMapping[1] = AMDGPU::getValueMapping(SrcBank->getID(), SrcSize);
- } else {
- // Scalar extend can use 64-bit BFE, but VGPRs require extending to
- // 32-bits, and then to 64.
- OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
- OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
- SrcSize);
- }
+ // Scalar extend can use 64-bit BFE, but VGPRs require extending to
+ // 32-bits, and then to 64.
+ OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
+ OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
+ SrcSize);
break;
}
case AMDGPU::G_FCMP: {
@@ -2790,43 +3736,43 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_STORE: {
assert(MI.getOperand(0).isReg());
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
- // FIXME: We need to specify a different reg bank once scalar stores
- // are supported.
+
+ // FIXME: We need to specify a different reg bank once scalar stores are
+ // supported.
const ValueMapping *ValMapping =
AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
- // FIXME: Depending on the type of store, the pointer could be in
- // the SGPR Reg bank.
- // FIXME: Pointer size should be based on the address space.
- const ValueMapping *PtrMapping =
- AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
-
OpdsMapping[0] = ValMapping;
- OpdsMapping[1] = PtrMapping;
+ OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
break;
}
-
case AMDGPU::G_ICMP: {
auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
+
+ // See if the result register has already been constrained to vcc, which may
+ // happen due to control flow intrinsic lowering.
+ unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI, *TRI,
+ AMDGPU::SGPRRegBankID);
unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
- bool CanUseSCC = Op2Bank == AMDGPU::SGPRRegBankID &&
+ bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
+ Op2Bank == AMDGPU::SGPRRegBankID &&
Op3Bank == AMDGPU::SGPRRegBankID &&
(Size == 32 || (Size == 64 &&
(Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
Subtarget.hasScalarCompareEq64()));
- unsigned Op0Bank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
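+ // When SCC cannot be used, the compare has to run on the VALU: the result
+ // goes to VCC and both sources are forced to VGPRs.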
+ DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
+ unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
// TODO: Use 32-bit for scalar output size.
// SCC results will need to be copied to a 32-bit SGPR virtual register.
const unsigned ResultSize = 1;
- OpdsMapping[0] = AMDGPU::getValueMapping(Op0Bank, ResultSize);
- OpdsMapping[1] = nullptr; // Predicate Operand.
- OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
- OpdsMapping[3] = AMDGPU::getValueMapping(Op3Bank, Size);
+ OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);
+ OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);
+ OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);
break;
}
case AMDGPU::G_EXTRACT_VECTOR_ELT: {
@@ -2852,15 +3798,22 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
- unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(),
MRI, *TRI);
unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
- OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, VecSize);
- OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(InsertEltBankID,
- InsertSize);
+ OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
+
+ // This is a weird case, because we need to break down the mapping based on
+ // the register bank of a different operand.
+ if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
+ OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID,
+ InsertSize);
+ } else {
+ assert(InsertSize == 32 || InsertSize == 64);
+ OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);
+ }
// The index can be in either bank if the source vector is a VGPR.
OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
@@ -2878,6 +3831,116 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
}
break;
}
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD:
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
+ case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
+ case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
+ case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
+ case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16:
+ case AMDGPU::G_AMDGPU_BUFFER_STORE:
+ case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
+ case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
+ case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
+ case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
+ OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
+
+ // rsrc
+ OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+
+ // vindex
+ OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+
+ // voffset
+ OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
+
+ // soffset
+ OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
+
+ // Any remaining operands are immediates and were correctly null
+ // initialized.
+ break;
+ }
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
+ // vdata_out
+ OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
+
+ // vdata_in
+ OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+
+ // rsrc
+ OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+
+ // vindex
+ OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
+
+ // voffset
+ OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
+
+ // soffset
+ OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
+
+ // Any remaining operands are immediates and were correctly null
+ // initialized.
+ break;
+ }
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
+ // vdata_out
+ OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
+
+ // vdata_in
+ OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+
+ // cmp
+ OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+
+ // rsrc
+ OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
+
+ // vindex
+ OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
+
+ // voffset
+ OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
+
+ // soffset
+ OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
+
+ // Any remaining operands are immediates and were correctly null
+ // initialized.
+ break;
+ }
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
+ // Lie and claim everything is legal, even though some need to be
+ // SGPRs. applyMapping will have to deal with it as a waterfall loop.
+ OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+ OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+
+ // We need to convert this to a MUBUF if either the resource or the offset
+ // is a VGPR.
+ unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
+ unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
+ unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);
+
+ unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
+ break;
+ }
case AMDGPU::G_INTRINSIC: {
switch (MI.getIntrinsicID()) {
default:
@@ -2890,9 +3953,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_log_clamp:
case Intrinsic::amdgcn_rcp:
case Intrinsic::amdgcn_rcp_legacy:
+ case Intrinsic::amdgcn_sqrt:
case Intrinsic::amdgcn_rsq:
case Intrinsic::amdgcn_rsq_legacy:
case Intrinsic::amdgcn_rsq_clamp:
+ case Intrinsic::amdgcn_fmul_legacy:
case Intrinsic::amdgcn_ldexp:
case Intrinsic::amdgcn_frexp_mant:
case Intrinsic::amdgcn_frexp_exp:
@@ -2911,8 +3976,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_fmad_ftz:
case Intrinsic::amdgcn_mbcnt_lo:
case Intrinsic::amdgcn_mbcnt_hi:
- case Intrinsic::amdgcn_ubfe:
- case Intrinsic::amdgcn_sbfe:
case Intrinsic::amdgcn_mul_u24:
case Intrinsic::amdgcn_mul_i24:
case Intrinsic::amdgcn_lerp:
@@ -2933,13 +3996,21 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_udot4:
case Intrinsic::amdgcn_sdot8:
case Intrinsic::amdgcn_udot8:
- case Intrinsic::amdgcn_wwm:
- case Intrinsic::amdgcn_wqm:
+ return getDefaultMappingVOP(MI);
+ case Intrinsic::amdgcn_sbfe:
+ case Intrinsic::amdgcn_ubfe:
+ if (isSALUMapping(MI))
+ return getDefaultMappingSOP(MI);
return getDefaultMappingVOP(MI);
case Intrinsic::amdgcn_ds_swizzle:
case Intrinsic::amdgcn_ds_permute:
case Intrinsic::amdgcn_ds_bpermute:
case Intrinsic::amdgcn_update_dpp:
+ case Intrinsic::amdgcn_mov_dpp8:
+ case Intrinsic::amdgcn_mov_dpp:
+ case Intrinsic::amdgcn_wwm:
+ case Intrinsic::amdgcn_wqm:
+ case Intrinsic::amdgcn_softwqm:
return getDefaultMappingAllVGPR(MI);
case Intrinsic::amdgcn_kernarg_segment_ptr:
case Intrinsic::amdgcn_s_getpc:
@@ -2954,26 +4025,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
= AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
break;
}
- case Intrinsic::amdgcn_s_buffer_load: {
- // FIXME: This should be moved to G_INTRINSIC_W_SIDE_EFFECTS
- Register RSrc = MI.getOperand(2).getReg(); // SGPR
- Register Offset = MI.getOperand(3).getReg(); // SGPR/imm
-
- unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
- unsigned Size2 = MRI.getType(RSrc).getSizeInBits();
- unsigned Size3 = MRI.getType(Offset).getSizeInBits();
-
- unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI);
- unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI);
-
- OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size0);
- OpdsMapping[1] = nullptr; // intrinsic id
-
- // Lie and claim everything is legal, even though some need to be
- // SGPRs. applyMapping will have to deal with it as a waterfall loop.
- OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc
- OpdsMapping[3] = AMDGPU::getValueMapping(OffsetBank, Size3);
- OpdsMapping[4] = nullptr;
+ case Intrinsic::amdgcn_ps_live: {
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
break;
}
case Intrinsic::amdgcn_div_scale: {
@@ -2983,11 +4036,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);
unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
- OpdsMapping[3] = AMDGPU::getValueMapping(
- getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI), SrcSize);
- OpdsMapping[4] = AMDGPU::getValueMapping(
- getRegBankID(MI.getOperand(4).getReg(), MRI, *TRI), SrcSize);
-
+ OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
+ OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
break;
}
case Intrinsic::amdgcn_class: {
@@ -2997,10 +4047,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
- OpdsMapping[2] = AMDGPU::getValueMapping(getRegBankID(Src0Reg, MRI, *TRI),
- Src0Size);
- OpdsMapping[3] = AMDGPU::getValueMapping(getRegBankID(Src1Reg, MRI, *TRI),
- Src1Size);
+ OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size);
+ OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size);
break;
}
case Intrinsic::amdgcn_icmp:
@@ -3009,10 +4057,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// This is not VCCRegBank because this is not used in boolean contexts.
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
- unsigned Op1Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
- unsigned Op2Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
- OpdsMapping[2] = AMDGPU::getValueMapping(Op1Bank, OpSize);
- OpdsMapping[3] = AMDGPU::getValueMapping(Op2Bank, OpSize);
+ OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
+ OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
break;
}
case Intrinsic::amdgcn_readlane: {
@@ -3054,6 +4100,16 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
break;
}
+ case Intrinsic::amdgcn_permlane16:
+ case Intrinsic::amdgcn_permlanex16: {
+ unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+ OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+ OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+ OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
+ OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
+ break;
+ }
case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
@@ -3086,9 +4142,46 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
break;
}
+ case Intrinsic::amdgcn_interp_p1:
+ case Intrinsic::amdgcn_interp_p2:
+ case Intrinsic::amdgcn_interp_mov:
+ case Intrinsic::amdgcn_interp_p1_f16:
+ case Intrinsic::amdgcn_interp_p2_f16: {
+ const int M0Idx = MI.getNumOperands() - 1;
+ Register M0Reg = MI.getOperand(M0Idx).getReg();
+ unsigned M0Bank = getRegBankID(M0Reg, MRI, *TRI, AMDGPU::SGPRRegBankID);
+ unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
+ for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
+ OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+
+ // Must be SGPR, but we must take whatever the original bank is and fix it
+ // later.
+ OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
+ break;
+ }
+ case Intrinsic::amdgcn_ballot: {
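+ // The ballot result is a uniform lane mask, so it maps to an SGPR even
+ // though the condition source is a VCC value.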
+ unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
+ OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);
+ break;
+ }
}
break;
}
+ case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
+ case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
+ auto IntrID = MI.getIntrinsicID();
+ const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID);
+ assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
+ // Non-images can have complications from operands that allow both SGPR
+ // and VGPR. For now it's too complicated to figure out the final opcode
+ // to derive the register bank from the MCInstrDesc.
+ assert(RSrcIntrin->IsImage);
+ return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
+ }
case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
auto IntrID = MI.getIntrinsicID();
switch (IntrID) {
@@ -3100,13 +4193,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
break;
}
- case Intrinsic::amdgcn_ds_append:
- case Intrinsic::amdgcn_ds_consume:
case Intrinsic::amdgcn_ds_fadd:
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax:
- case Intrinsic::amdgcn_atomic_inc:
- case Intrinsic::amdgcn_atomic_dec:
return getDefaultMappingAllVGPR(MI);
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap: {
@@ -3118,17 +4207,16 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
break;
}
+ case Intrinsic::amdgcn_ds_append:
+ case Intrinsic::amdgcn_ds_consume: {
+ unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
+ OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+ break;
+ }
case Intrinsic::amdgcn_exp_compr:
- OpdsMapping[0] = nullptr; // IntrinsicID
- // FIXME: These are immediate values which can't be read from registers.
- OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
- OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
- // FIXME: Could we support packed types here?
OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
- // FIXME: These are immediate values which can't be read from registers.
- OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
- OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
break;
case Intrinsic::amdgcn_exp:
// FIXME: Could we support packed types here?
@@ -3137,31 +4225,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
break;
- case Intrinsic::amdgcn_buffer_load: {
- Register RSrc = MI.getOperand(2).getReg(); // SGPR
- Register VIndex = MI.getOperand(3).getReg(); // VGPR
- Register Offset = MI.getOperand(4).getReg(); // SGPR/VGPR/imm
-
- unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
- unsigned Size2 = MRI.getType(RSrc).getSizeInBits();
- unsigned Size3 = MRI.getType(VIndex).getSizeInBits();
- unsigned Size4 = MRI.getType(Offset).getSizeInBits();
-
- unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI);
- unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI);
-
- OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0);
- OpdsMapping[1] = nullptr; // intrinsic id
-
- // Lie and claim everything is legal, even though some need to be
- // SGPRs. applyMapping will have to deal with it as a waterfall loop.
- OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc
- OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size3);
- OpdsMapping[4] = AMDGPU::getValueMapping(OffsetBank, Size4);
- OpdsMapping[5] = nullptr;
- OpdsMapping[6] = nullptr;
- break;
- }
case Intrinsic::amdgcn_s_sendmsg:
case Intrinsic::amdgcn_s_sendmsghalt: {
// This must be an SGPR, but accept a VGPR.
@@ -3170,8 +4233,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
break;
}
- case Intrinsic::amdgcn_end_cf:
- case Intrinsic::amdgcn_init_exec: {
+ case Intrinsic::amdgcn_s_setreg: {
+ // This must be an SGPR, but accept a VGPR.
+ unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
+ AMDGPU::SGPRRegBankID);
+ OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
+ break;
+ }
+ case Intrinsic::amdgcn_end_cf: {
unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
break;
@@ -3227,7 +4296,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_init_exec_from_input: {
unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
- OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
break;
}
case Intrinsic::amdgcn_ds_gws_init:
@@ -3251,15 +4319,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
break;
}
default:
- if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
- AMDGPU::lookupRsrcIntrinsic(IntrID)) {
- // Non-images can have complications from operands that allow both SGPR
- // and VGPR. For now it's too complicated to figure out the final opcode
- // to derive the register bank from the MCInstrDesc.
- if (RSrcIntrin->IsImage)
- return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
- }
-
return getInvalidInstructionMapping();
}
break;
@@ -3319,9 +4378,20 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_ATOMICRMW_UMAX:
case AMDGPU::G_ATOMICRMW_UMIN:
case AMDGPU::G_ATOMICRMW_FADD:
- case AMDGPU::G_ATOMIC_CMPXCHG:
- case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: {
- return getDefaultMappingAllVGPR(MI);
+ case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
+ case AMDGPU::G_AMDGPU_ATOMIC_INC:
+ case AMDGPU::G_AMDGPU_ATOMIC_DEC: {
+ OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
+ OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
+ OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+ break;
+ }
+ case AMDGPU::G_ATOMIC_CMPXCHG: {
+ OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
+ OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
+ OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+ OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
+ break;
}
case AMDGPU::G_BRCOND: {
unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI, *TRI,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
index 1ac7d3652a8b3..8f38ec4eeb3a4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
@@ -69,13 +69,20 @@ public:
void constrainOpWithReadfirstlane(MachineInstr &MI, MachineRegisterInfo &MRI,
unsigned OpIdx) const;
- bool applyMappingWideLoad(MachineInstr &MI,
- const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
- MachineRegisterInfo &MRI) const;
+ bool applyMappingDynStackAlloc(MachineInstr &MI,
+ const OperandsMapper &OpdMapper,
+ MachineRegisterInfo &MRI) const;
+ bool applyMappingLoad(MachineInstr &MI,
+ const OperandsMapper &OpdMapper,
+ MachineRegisterInfo &MRI) const;
bool
applyMappingImage(MachineInstr &MI,
- const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
+ const OperandsMapper &OpdMapper,
MachineRegisterInfo &MRI, int RSrcIdx) const;
+ bool applyMappingSBufferLoad(const OperandsMapper &OpdMapper) const;
+
+ bool applyMappingBFEIntrinsic(const OperandsMapper &OpdMapper,
+ bool Signed) const;
void lowerScalarMinMax(MachineIRBuilder &B, MachineInstr &MI) const;
@@ -91,6 +98,9 @@ public:
/// See RegisterBankInfo::applyMapping.
void applyMappingImpl(const OperandsMapper &OpdMapper) const override;
+ const ValueMapping *getValueMappingForPtr(const MachineRegisterInfo &MRI,
+ Register Ptr) const;
+
const RegisterBankInfo::InstructionMapping &
getInstrMappingForLoad(const MachineInstr &MI) const;
@@ -168,6 +178,15 @@ public:
const InstructionMapping &
getInstrMapping(const MachineInstr &MI) const override;
+
+private:
+
+ bool foldExtractEltToCmpSelect(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ const OperandsMapper &OpdMapper) const;
+ bool foldInsertEltToCmpSelect(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ const OperandsMapper &OpdMapper) const;
};
} // End llvm namespace.
#endif
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
index c495316c5bce0..9f6ebd00cd97b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
@@ -7,16 +7,16 @@
//===----------------------------------------------------------------------===//
def SGPRRegBank : RegisterBank<"SGPR",
- [SReg_32, SReg_64, SReg_128, SReg_256, SReg_512, SReg_1024]
+ [SReg_LO16, SReg_32, SReg_64, SReg_128, SReg_160, SReg_192, SReg_256, SReg_512, SReg_1024]
>;
def VGPRRegBank : RegisterBank<"VGPR",
- [VGPR_32, VReg_64, VReg_96, VReg_128, VReg_256, VReg_512, VReg_1024]
+ [VGPR_LO16, VGPR_HI16, VGPR_32, VReg_64, VReg_96, VReg_128, VReg_160, VReg_192, VReg_256, VReg_512, VReg_1024]
>;
// It is helpful to distinguish conditions from ordinary SGPRs.
def VCCRegBank : RegisterBank <"VCC", [SReg_1]>;
def AGPRRegBank : RegisterBank <"AGPR",
- [AGPR_32, AReg_64, AReg_128, AReg_512, AReg_1024]
+ [AGPR_LO16, AGPR_32, AReg_64, AReg_96, AReg_128, AReg_160, AReg_192, AReg_256, AReg_512, AReg_1024]
>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
deleted file mode 100644
index 9806e6b0714f6..0000000000000
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
+++ /dev/null
@@ -1,142 +0,0 @@
-//===-- AMDGPURegisterInfo.cpp - AMDGPU Register Information -------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// Parent TargetRegisterInfo class common to all hw codegen targets.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPURegisterInfo.h"
-#include "AMDGPUTargetMachine.h"
-#include "SIMachineFunctionInfo.h"
-#include "SIRegisterInfo.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-
-using namespace llvm;
-
-AMDGPURegisterInfo::AMDGPURegisterInfo() : AMDGPUGenRegisterInfo(0) {}
-
-//===----------------------------------------------------------------------===//
-// Function handling callbacks - Functions are a seldom used feature of GPUS, so
-// they are not supported at this time.
-//===----------------------------------------------------------------------===//
-
-// Table of NumRegs sized pieces at every 32-bit offset.
-static const uint16_t SubRegFromChannelTable[][32] = {
- { AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
- AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
- AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
- AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
- AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19,
- AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23,
- AMDGPU::sub24, AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27,
- AMDGPU::sub28, AMDGPU::sub29, AMDGPU::sub30, AMDGPU::sub31
- },
- {
- AMDGPU::sub0_sub1, AMDGPU::sub1_sub2, AMDGPU::sub2_sub3, AMDGPU::sub3_sub4,
- AMDGPU::sub4_sub5, AMDGPU::sub5_sub6, AMDGPU::sub6_sub7, AMDGPU::sub7_sub8,
- AMDGPU::sub8_sub9, AMDGPU::sub9_sub10, AMDGPU::sub10_sub11, AMDGPU::sub11_sub12,
- AMDGPU::sub12_sub13, AMDGPU::sub13_sub14, AMDGPU::sub14_sub15, AMDGPU::sub15_sub16,
- AMDGPU::sub16_sub17, AMDGPU::sub17_sub18, AMDGPU::sub18_sub19, AMDGPU::sub19_sub20,
- AMDGPU::sub20_sub21, AMDGPU::sub21_sub22, AMDGPU::sub22_sub23, AMDGPU::sub23_sub24,
- AMDGPU::sub24_sub25, AMDGPU::sub25_sub26, AMDGPU::sub26_sub27, AMDGPU::sub27_sub28,
- AMDGPU::sub28_sub29, AMDGPU::sub29_sub30, AMDGPU::sub30_sub31, AMDGPU::NoSubRegister
- },
- {
- AMDGPU::sub0_sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub3_sub4_sub5,
- AMDGPU::sub4_sub5_sub6, AMDGPU::sub5_sub6_sub7, AMDGPU::sub6_sub7_sub8, AMDGPU::sub7_sub8_sub9,
- AMDGPU::sub8_sub9_sub10, AMDGPU::sub9_sub10_sub11, AMDGPU::sub10_sub11_sub12, AMDGPU::sub11_sub12_sub13,
- AMDGPU::sub12_sub13_sub14, AMDGPU::sub13_sub14_sub15, AMDGPU::sub14_sub15_sub16, AMDGPU::sub15_sub16_sub17,
- AMDGPU::sub16_sub17_sub18, AMDGPU::sub17_sub18_sub19, AMDGPU::sub18_sub19_sub20, AMDGPU::sub19_sub20_sub21,
- AMDGPU::sub20_sub21_sub22, AMDGPU::sub21_sub22_sub23, AMDGPU::sub22_sub23_sub24, AMDGPU::sub23_sub24_sub25,
- AMDGPU::sub24_sub25_sub26, AMDGPU::sub25_sub26_sub27, AMDGPU::sub26_sub27_sub28, AMDGPU::sub27_sub28_sub29,
- AMDGPU::sub28_sub29_sub30, AMDGPU::sub29_sub30_sub31, AMDGPU::NoSubRegister, AMDGPU::NoSubRegister
- },
- {
- AMDGPU::sub0_sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6,
- AMDGPU::sub4_sub5_sub6_sub7, AMDGPU::sub5_sub6_sub7_sub8, AMDGPU::sub6_sub7_sub8_sub9, AMDGPU::sub7_sub8_sub9_sub10,
- AMDGPU::sub8_sub9_sub10_sub11, AMDGPU::sub9_sub10_sub11_sub12, AMDGPU::sub10_sub11_sub12_sub13, AMDGPU::sub11_sub12_sub13_sub14,
- AMDGPU::sub12_sub13_sub14_sub15, AMDGPU::sub13_sub14_sub15_sub16, AMDGPU::sub14_sub15_sub16_sub17, AMDGPU::sub15_sub16_sub17_sub18,
- AMDGPU::sub16_sub17_sub18_sub19, AMDGPU::sub17_sub18_sub19_sub20, AMDGPU::sub18_sub19_sub20_sub21, AMDGPU::sub19_sub20_sub21_sub22,
- AMDGPU::sub20_sub21_sub22_sub23, AMDGPU::sub21_sub22_sub23_sub24, AMDGPU::sub22_sub23_sub24_sub25, AMDGPU::sub23_sub24_sub25_sub26,
- AMDGPU::sub24_sub25_sub26_sub27, AMDGPU::sub25_sub26_sub27_sub28, AMDGPU::sub26_sub27_sub28_sub29, AMDGPU::sub27_sub28_sub29_sub30,
- AMDGPU::sub28_sub29_sub30_sub31, AMDGPU::NoSubRegister, AMDGPU::NoSubRegister, AMDGPU::NoSubRegister
- }
-};
-
-// FIXME: TableGen should generate something to make this manageable for all
-// register classes. At a minimum we could use the opposite of
-// composeSubRegIndices and go up from the base 32-bit subreg.
-unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel, unsigned NumRegs) {
- const unsigned NumRegIndex = NumRegs - 1;
-
- assert(NumRegIndex < array_lengthof(SubRegFromChannelTable) &&
- "Not implemented");
- assert(Channel < array_lengthof(SubRegFromChannelTable[0]));
- return SubRegFromChannelTable[NumRegIndex][Channel];
-}
-
-void AMDGPURegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const {
- MCRegAliasIterator R(Reg, this, true);
-
- for (; R.isValid(); ++R)
- Reserved.set(*R);
-}
-
-#define GET_REGINFO_TARGET_DESC
-#include "AMDGPUGenRegisterInfo.inc"
-
-// Forced to be here by one .inc
-const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
- const MachineFunction *MF) const {
- CallingConv::ID CC = MF->getFunction().getCallingConv();
- switch (CC) {
- case CallingConv::C:
- case CallingConv::Fast:
- case CallingConv::Cold:
- return CSR_AMDGPU_HighRegs_SaveList;
- default: {
- // Dummy to not crash RegisterClassInfo.
- static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
- return &NoCalleeSavedReg;
- }
- }
-}
-
-const MCPhysReg *
-SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const {
- return nullptr;
-}
-
-const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
- CallingConv::ID CC) const {
- switch (CC) {
- case CallingConv::C:
- case CallingConv::Fast:
- case CallingConv::Cold:
- return CSR_AMDGPU_HighRegs_RegMask;
- default:
- return nullptr;
- }
-}
-
-Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
- const SIFrameLowering *TFI =
- MF.getSubtarget<GCNSubtarget>().getFrameLowering();
- const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
- return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg()
- : FuncInfo->getStackPtrOffsetReg();
-}
-
-const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const {
- return CSR_AMDGPU_AllVGPRs_RegMask;
-}
-
-const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const {
- return CSR_AMDGPU_AllAllocatableSRegs_RegMask;
-}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h b/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h
deleted file mode 100644
index 9e713ca804a11..0000000000000
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h
+++ /dev/null
@@ -1,38 +0,0 @@
-//===-- AMDGPURegisterInfo.h - AMDGPURegisterInfo Interface -*- C++ -*-----===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// TargetRegisterInfo interface that is implemented by all hw codegen
-/// targets.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERINFO_H
-#define LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERINFO_H
-
-#define GET_REGINFO_HEADER
-#include "AMDGPUGenRegisterInfo.inc"
-
-namespace llvm {
-
-class GCNSubtarget;
-class TargetInstrInfo;
-
-struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo {
- AMDGPURegisterInfo();
-
- /// \returns the sub reg enum value for the given \p Channel
- /// (e.g. getSubRegFromChannel(0) -> AMDGPU::sub0)
- static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs = 1);
-
- void reserveRegisterTuples(BitVector &, unsigned Reg) const;
-};
-
-} // End namespace llvm
-
-#endif
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.td b/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.td
deleted file mode 100644
index ab71b7aa8a572..0000000000000
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.td
+++ /dev/null
@@ -1,21 +0,0 @@
-//===-- AMDGPURegisterInfo.td - AMDGPU register info -------*- tablegen -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Tablegen register definitions common to all hw codegen targets.
-//
-//===----------------------------------------------------------------------===//
-
-let Namespace = "AMDGPU" in {
-
-foreach Index = 0-31 in {
- def sub#Index : SubRegIndex<32, !shl(Index, 5)>;
-}
-
-}
-
-include "SIRegisterInfo.td"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
index 9a1e2fc42ed57..9c3d96de6d68a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
@@ -208,8 +208,8 @@ bool AMDGPURewriteOutArguments::doInitialization(Module &M) {
#ifndef NDEBUG
bool AMDGPURewriteOutArguments::isVec3ToVec4Shuffle(Type *Ty0, Type* Ty1) const {
- VectorType *VT0 = dyn_cast<VectorType>(Ty0);
- VectorType *VT1 = dyn_cast<VectorType>(Ty1);
+ auto *VT0 = dyn_cast<FixedVectorType>(Ty0);
+ auto *VT1 = dyn_cast<FixedVectorType>(Ty1);
if (!VT0 || !VT1)
return false;
@@ -409,7 +409,7 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
DL->getTypeSizeInBits(Val->getType())) {
assert(isVec3ToVec4Shuffle(EffectiveEltTy, Val->getType()));
Val = B.CreateShuffleVector(Val, UndefValue::get(Val->getType()),
- { 0, 1, 2 });
+ ArrayRef<int>{0, 1, 2});
}
Val = B.CreateBitCast(Val, EffectiveEltTy);
@@ -453,9 +453,8 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
PointerType *ArgType = cast<PointerType>(Arg.getType());
auto *EltTy = ArgType->getElementType();
- unsigned Align = Arg.getParamAlignment();
- if (Align == 0)
- Align = DL->getABITypeAlignment(EltTy);
+ const auto Align =
+ DL->getValueOrABITypeAlignment(Arg.getParamAlign(), EltTy);
Value *Val = B.CreateExtractValue(StubCall, RetIdx++);
Type *PtrTy = Val->getType()->getPointerTo(ArgType->getAddressSpace());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 8d70536ec21c5..bc68310b2f5ca 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -198,6 +198,7 @@ def : SourceOfDivergence<int_r600_read_tidig_y>;
def : SourceOfDivergence<int_r600_read_tidig_z>;
def : SourceOfDivergence<int_amdgcn_atomic_inc>;
def : SourceOfDivergence<int_amdgcn_atomic_dec>;
+def : SourceOfDivergence<int_amdgcn_global_atomic_csub>;
def : SourceOfDivergence<int_amdgcn_ds_fadd>;
def : SourceOfDivergence<int_amdgcn_ds_fmin>;
def : SourceOfDivergence<int_amdgcn_ds_fmax>;
@@ -238,6 +239,7 @@ def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_xor>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_inc>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_dec>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cmpswap>;
+def : SourceOfDivergence<int_amdgcn_buffer_atomic_csub>;
def : SourceOfDivergence<int_amdgcn_ps_live>;
def : SourceOfDivergence<int_amdgcn_ds_swizzle>;
def : SourceOfDivergence<int_amdgcn_ds_ordered_add>;
@@ -247,6 +249,7 @@ def : SourceOfDivergence<int_amdgcn_permlanex16>;
def : SourceOfDivergence<int_amdgcn_mov_dpp>;
def : SourceOfDivergence<int_amdgcn_mov_dpp8>;
def : SourceOfDivergence<int_amdgcn_update_dpp>;
+def : SourceOfDivergence<int_amdgcn_writelane>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x1f32>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x1f32>;
@@ -270,5 +273,13 @@ def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x8i8>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x2bf16>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4bf16>;
+// The dummy boolean output is divergent from the IR's perspective,
+// but the mask results are uniform. These produce a divergent and
+// uniform result, so the returned struct is collectively divergent.
+// isAlwaysUniform can override the extract of the uniform component.
+def : SourceOfDivergence<int_amdgcn_if>;
+def : SourceOfDivergence<int_amdgcn_else>;
+def : SourceOfDivergence<int_amdgcn_loop>;
+
foreach intr = AMDGPUImageDimAtomicIntrinsics in
def : SourceOfDivergence<intr>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 445e91092499a..213788ae0f67b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -59,13 +59,6 @@ R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
FullFS += FS;
ParseSubtargetFeatures(GPU, FullFS);
- // FIXME: I don't think think Evergreen has any useful support for
- // denormals, but should be checked. Should we issue a warning somewhere
- // if someone tries to enable these?
- if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
- FP32Denormals = false;
- }
-
HasMulU24 = getGeneration() >= EVERGREEN;
HasMulI24 = hasCaymanISA();
@@ -76,9 +69,6 @@ GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
StringRef GPU, StringRef FS) {
// Determine default and user-specified characteristics
- // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
- // enabled, but some instructions do not respect them and they run at the
- // double precision rate, so don't enable by default.
//
// We want to be able to turn these off, but making this a subtarget feature
// for SI has the unhelpful behavior that it unsets everything else if you
@@ -88,20 +78,11 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
// unset everything else if it is disabled
// Assuming ECC is enabled is the conservative default.
- SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");
+ SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,+sram-ecc,+xnack,");
if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";
- // FIXME: I don't think think Evergreen has any useful support for
- // denormals, but should be checked. Should we issue a warning somewhere
- // if someone tries to enable these?
- if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
- FullFS += "+fp64-fp16-denormals,";
- } else {
- FullFS += "-fp32-denormals,";
- }
-
FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
// Disable mutually exclusive bits.
@@ -145,12 +126,14 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
}
// Don't crash on invalid devices.
- if (WavefrontSize == 0)
- WavefrontSize = 64;
+ if (WavefrontSizeLog2 == 0)
+ WavefrontSizeLog2 = 5;
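+ // The value is a log2 encoding: 5 corresponds to a 32-wide wave (1 << 5).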
HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
- if (DoesNotSupportXNACK && EnableXNACK) {
+ // Disable XNACK on targets where it is not enabled by default unless it is
+ // explicitly requested.
+ if (!FS.contains("+xnack") && DoesNotSupportXNACK && EnableXNACK) {
ToggleFeature(AMDGPU::FeatureXNACK);
EnableXNACK = false;
}
@@ -170,8 +153,8 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
TargetTriple(TT),
Has16BitInsts(false),
HasMadMixInsts(false),
- FP32Denormals(false),
- FPExceptions(false),
+ HasMadMacF32Insts(false),
+ HasDsSrc2Insts(false),
HasSDWA(false),
HasVOP3PInsts(false),
HasMulI24(true),
@@ -182,7 +165,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
HasTrigReducedRange(false),
MaxWavesPerEU(10),
LocalMemorySize(0),
- WavefrontSize(0)
+ WavefrontSizeLog2(0)
{ }
GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
@@ -196,9 +179,9 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
MaxPrivateElementSize(0),
FastFMAF32(false),
+ FastDenormalF32(false),
HalfRate64Ops(false),
- FP64FP16Denormals(false),
FlatForGlobal(false),
AutoWaitcntBeforeBarrier(false),
CodeObjectV3(false),
@@ -224,6 +207,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
GFX8Insts(false),
GFX9Insts(false),
GFX10Insts(false),
+ GFX10_3Insts(false),
GFX7GFX8GFX9Insts(false),
SGPRInitBug(false),
HasSMemRealTime(false),
@@ -241,7 +225,10 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
HasDPP(false),
HasDPP8(false),
HasR128A16(false),
+ HasGFX10A16(false),
+ HasG16(false),
HasNSAEncoding(false),
+ GFX10_BEncoding(false),
HasDLInsts(false),
HasDot1Insts(false),
HasDot2Insts(false),
@@ -256,6 +243,8 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
DoesNotSupportSRAMECC(false),
HasNoSdstCMPX(false),
HasVscnt(false),
+ HasGetWaveIdInst(false),
+ HasSMemTimeInst(false),
HasRegisterBanking(false),
HasVOP3Literal(false),
HasNoDataDepHazard(false),
@@ -287,6 +276,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
+ InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
InstSelector.reset(new AMDGPUInstructionSelector(
@@ -325,18 +315,41 @@ unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}
+// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
const Function &F) const {
- unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
- unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
- if (!WorkGroupsPerCu)
+ const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
+ const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
+ if (!MaxWorkGroupsPerCu)
return 0;
- unsigned MaxWaves = getMaxWavesPerEU();
- unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
- unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
- NumWaves = std::min(NumWaves, MaxWaves);
- NumWaves = std::max(NumWaves, 1u);
- return NumWaves;
+
+ const unsigned WaveSize = getWavefrontSize();
+
+ // FIXME: Do we need to account for alignment requirement of LDS rounding the
+ // size up?
+ // Compute restriction based on LDS usage
+ unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);
+
+ // This can be queried with more LDS than is possible, so just assume the
+ // worst.
+ if (NumGroups == 0)
+ return 1;
+
+ NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);
+
+ // Round to the number of waves.
+ const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
+ unsigned MaxWaves = NumGroups * MaxGroupNumWaves;
+
+ // Clamp to the maximum possible number of waves.
+ MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());
+
+ // FIXME: Needs to be a multiple of the group size?
+ //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);
+
+ assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
+ "computed invalid occupancy");
+ return MaxWaves;
}
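
The rewritten occupancy helper above is pure integer arithmetic: divide the LDS budget by the per-workgroup LDS usage, clamp by the per-CU workgroup limit, convert workgroups to waves, and clamp by the per-EU wave limit. A minimal standalone sketch of that arithmetic, with the subtarget queries replaced by plain parameters (the function and parameter names here are illustrative, not LLVM API):

#include <algorithm>
#include <cassert>

// Standalone sketch of the wave-based occupancy computation above. The
// parameters stand in for getLocalMemorySize(), getMaxWorkGroupsPerCU(),
// getWavefrontSize() and getMaxWavesPerEU().
unsigned occupancyWithLocalMemSize(unsigned Bytes, unsigned LocalMemorySize,
                                   unsigned MaxWorkGroupsPerCu,
                                   unsigned MaxWorkGroupSize,
                                   unsigned WaveSize, unsigned MaxWavesPerEU) {
  // Workgroups that fit in the LDS budget; avoid dividing by zero for
  // kernels that use no LDS.
  unsigned NumGroups = LocalMemorySize / (Bytes ? Bytes : 1u);
  if (NumGroups == 0)
    return 1; // More LDS requested than exists; assume the worst.

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round the largest work group up to a whole number of waves, then clamp
  // by the per-EU wave limit.
  unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = std::min(NumGroups * MaxGroupNumWaves, MaxWavesPerEU);

  assert(MaxWaves > 0 && MaxWaves <= MaxWavesPerEU && "invalid occupancy");
  return MaxWaves;
}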
unsigned
@@ -396,13 +409,10 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
// number of waves per execution unit to values implied by requested
// minimum/maximum flat work group sizes.
unsigned MinImpliedByFlatWorkGroupSize =
- getMaxWavesPerEU(FlatWorkGroupSizes.second);
- bool RequestedFlatWorkGroupSize = false;
-
- if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
- Default.first = MinImpliedByFlatWorkGroupSize;
- RequestedFlatWorkGroupSize = true;
- }
+ getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
+ Default.first = MinImpliedByFlatWorkGroupSize;
+ bool RequestedFlatWorkGroupSize =
+ F.hasFnAttribute("amdgpu-flat-work-group-size");
// Requested minimum/maximum number of waves per execution unit.
std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
@@ -414,9 +424,7 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
// Make sure requested values do not violate subtarget's specifications.
if (Requested.first < getMinWavesPerEU() ||
- Requested.first > getMaxWavesPerEU())
- return Default;
- if (Requested.second > getMaxWavesPerEU())
+ Requested.second > getMaxWavesPerEU())
return Default;
// Make sure requested values are compatible with values implied by requested
@@ -497,12 +505,12 @@ uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
const DataLayout &DL = F.getParent()->getDataLayout();
uint64_t ExplicitArgBytes = 0;
- MaxAlign = Align::None();
+ MaxAlign = Align(1);
for (const Argument &Arg : F.args()) {
Type *ArgTy = Arg.getType();
- const Align Alignment(DL.getABITypeAlignment(ArgTy));
+ const Align Alignment = DL.getABITypeAlign(ArgTy);
uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
MaxAlign = std::max(MaxAlign, Alignment);
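
The kernarg-size loop above is a standard "pad then append" layout computation: each argument is placed at the next offset satisfying its ABI alignment, while the maximum alignment seen so far is tracked on the side. A small self-contained sketch of the same accumulation, with made-up argument sizes and alignments standing in for the DataLayout queries:

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

// Round Offset up to the next multiple of A (A is a power of two).
static uint64_t alignTo(uint64_t Offset, uint64_t A) {
  return (Offset + A - 1) & ~(A - 1);
}

int main() {
  // (size, ABI alignment) pairs standing in for DL.getTypeAllocSize /
  // DL.getABITypeAlign of each explicit kernel argument.
  std::vector<std::pair<uint64_t, uint64_t>> Args = {{4, 4}, {1, 1}, {16, 16}};

  uint64_t ExplicitArgBytes = 0, MaxAlign = 1;
  for (auto [Size, ABIAlign] : Args) {
    ExplicitArgBytes = alignTo(ExplicitArgBytes, ABIAlign) + Size;
    MaxAlign = std::max(MaxAlign, ABIAlign);
  }
  std::printf("bytes=%llu maxalign=%llu\n",
              (unsigned long long)ExplicitArgBytes,
              (unsigned long long)MaxAlign);
}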
@@ -622,13 +630,12 @@ unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
return 2; // VCC.
}
-unsigned GCNSubtarget::computeOccupancy(const MachineFunction &MF,
- unsigned LDSSize,
+unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
unsigned NumSGPRs,
unsigned NumVGPRs) const {
unsigned Occupancy =
std::min(getMaxWavesPerEU(),
- getOccupancyWithLocalMemSize(LDSSize, MF.getFunction()));
+ getOccupancyWithLocalMemSize(LDSSize, F));
if (NumSGPRs)
Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
if (NumVGPRs)
@@ -716,20 +723,20 @@ unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
return MaxNumVGPRs;
}
-void GCNSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst,
- SDep &Dep) const {
+void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
+ int UseOpIdx, SDep &Dep) const {
if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
- !Src->isInstr() || !Dst->isInstr())
+ !Def->isInstr() || !Use->isInstr())
return;
- MachineInstr *SrcI = Src->getInstr();
- MachineInstr *DstI = Dst->getInstr();
+ MachineInstr *DefI = Def->getInstr();
+ MachineInstr *UseI = Use->getInstr();
- if (SrcI->isBundle()) {
+ if (DefI->isBundle()) {
const SIRegisterInfo *TRI = getRegisterInfo();
auto Reg = Dep.getReg();
- MachineBasicBlock::const_instr_iterator I(SrcI->getIterator());
- MachineBasicBlock::const_instr_iterator E(SrcI->getParent()->instr_end());
+ MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
+ MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
unsigned Lat = 0;
for (++I; I != E && I->isBundledWithPred(); ++I) {
if (I->modifiesRegister(Reg, TRI))
@@ -738,12 +745,12 @@ void GCNSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst,
--Lat;
}
Dep.setLatency(Lat);
- } else if (DstI->isBundle()) {
+ } else if (UseI->isBundle()) {
const SIRegisterInfo *TRI = getRegisterInfo();
auto Reg = Dep.getReg();
- MachineBasicBlock::const_instr_iterator I(DstI->getIterator());
- MachineBasicBlock::const_instr_iterator E(DstI->getParent()->instr_end());
- unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *SrcI);
+ MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
+ MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
+ unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
if (I->readsRegister(Reg, TRI))
break;
@@ -754,53 +761,6 @@ void GCNSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst,
}
namespace {
-struct MemOpClusterMutation : ScheduleDAGMutation {
- const SIInstrInfo *TII;
-
- MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}
-
- void apply(ScheduleDAGInstrs *DAG) override {
- SUnit *SUa = nullptr;
- // Search for two consequent memory operations and link them
- // to prevent scheduler from moving them apart.
- // In DAG pre-process SUnits are in the original order of
- // the instructions before scheduling.
- for (SUnit &SU : DAG->SUnits) {
- MachineInstr &MI2 = *SU.getInstr();
- if (!MI2.mayLoad() && !MI2.mayStore()) {
- SUa = nullptr;
- continue;
- }
- if (!SUa) {
- SUa = &SU;
- continue;
- }
-
- MachineInstr &MI1 = *SUa->getInstr();
- if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
- (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
- (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
- (TII->isDS(MI1) && TII->isDS(MI2))) {
- SU.addPredBarrier(SUa);
-
- for (const SDep &SI : SU.Preds) {
- if (SI.getSUnit() != SUa)
- SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
- }
-
- if (&SU != &DAG->ExitSU) {
- for (const SDep &SI : SUa->Succs) {
- if (SI.getSUnit() != &SU)
- SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
- }
- }
- }
-
- SUa = &SU;
- }
- }
-};
-
struct FillMFMAShadowMutation : ScheduleDAGMutation {
const SIInstrInfo *TII;
@@ -927,7 +887,6 @@ struct FillMFMAShadowMutation : ScheduleDAGMutation {
void GCNSubtarget::getPostRAMutations(
std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
- Mutations.push_back(std::make_unique<MemOpClusterMutation>(&InstrInfo));
Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 19a240800ba14..c833bfbcf9366 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -16,6 +16,7 @@
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "R600FrameLowering.h"
#include "R600ISelLowering.h"
#include "R600InstrInfo.h"
@@ -24,6 +25,7 @@
#include "SIInstrInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
@@ -65,8 +67,8 @@ private:
protected:
bool Has16BitInsts;
bool HasMadMixInsts;
- bool FP32Denormals;
- bool FPExceptions;
+ bool HasMadMacF32Insts;
+ bool HasDsSrc2Insts;
bool HasSDWA;
bool HasVOP3PInsts;
bool HasMulI24;
@@ -77,7 +79,7 @@ protected:
bool HasTrigReducedRange;
unsigned MaxWavesPerEU;
int LocalMemorySize;
- unsigned WavefrontSize;
+ char WavefrontSizeLog2;
public:
AMDGPUSubtarget(const Triple &TT);
@@ -140,6 +142,10 @@ public:
return isAmdHsaOS() || isMesaKernel(F);
}
+ bool isGCN() const {
+ return TargetTriple.getArch() == Triple::amdgcn;
+ }
+
bool has16BitInsts() const {
return Has16BitInsts;
}
@@ -148,17 +154,12 @@ public:
return HasMadMixInsts;
}
- bool hasFP32Denormals(const Function &F) const {
- // FIXME: This should not be a property of the subtarget. This should be a
- // property with a default set by the calling convention which can be
- // overridden by attributes. For now, use the subtarget feature as a
- // placeholder attribute. The function arguments only purpose is to
- // discourage use without a function context until this is removed.
- return FP32Denormals;
+ bool hasMadMacF32Insts() const {
+ return HasMadMacF32Insts || !isGCN();
}
- bool hasFPExceptions() const {
- return FPExceptions;
+ bool hasDsSrc2Insts() const {
+ return HasDsSrc2Insts;
}
bool hasSDWA() const {
@@ -194,7 +195,11 @@ public:
}
unsigned getWavefrontSize() const {
- return WavefrontSize;
+ return 1 << WavefrontSizeLog2;
+ }
+
+ unsigned getWavefrontSizeLog2() const {
+ return WavefrontSizeLog2;
}
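
Storing only the log2 of the wavefront size and rebuilding the size with a shift keeps the two accessors trivially consistent and fits in a single byte. A tiny sketch of the same idea outside the subtarget class (the struct name is made up for illustration):

// Sketch: keep only log2(wavefront size) and derive everything else from it,
// as the subtarget now does. 5 -> wave32, 6 -> wave64.
struct WaveInfo {
  char WavefrontSizeLog2 = 5;
  unsigned getWavefrontSize() const { return 1u << WavefrontSizeLog2; }
  unsigned getWavefrontSizeLog2() const { return WavefrontSizeLog2; }
  bool isWave32() const { return getWavefrontSize() == 32; }
};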
int getLocalMemorySize() const {
@@ -221,9 +226,10 @@ public:
/// \returns Maximum flat work group size supported by the subtarget.
virtual unsigned getMaxFlatWorkGroupSize() const = 0;
- /// \returns Maximum number of waves per execution unit supported by the
- /// subtarget and limited by given \p FlatWorkGroupSize.
- virtual unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const = 0;
+ /// \returns Number of waves per execution unit required to support the given
+ /// \p FlatWorkGroupSize.
+ virtual unsigned
+ getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const = 0;
/// \returns Minimum number of waves per execution unit supported by the
/// subtarget.
@@ -246,6 +252,13 @@ public:
uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const;
unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const;
+ /// \returns Corresponding DWARF register number mapping flavour for the
+ /// \p WavefrontSize.
+ AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const {
+ return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
+ : AMDGPUDwarfFlavour::Wave64;
+ }
+
virtual ~AMDGPUSubtarget() {}
};
@@ -278,6 +291,7 @@ public:
private:
/// GlobalISel related APIs.
std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
+ std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
std::unique_ptr<InstructionSelector> InstSelector;
std::unique_ptr<LegalizerInfo> Legalizer;
std::unique_ptr<RegisterBankInfo> RegBankInfo;
@@ -292,10 +306,10 @@ protected:
// Possibly statically set by tablegen, but may want to be overridden.
bool FastFMAF32;
+ bool FastDenormalF32;
bool HalfRate64Ops;
// Dynamially set bits that enable features.
- bool FP64FP16Denormals;
bool FlatForGlobal;
bool AutoWaitcntBeforeBarrier;
bool CodeObjectV3;
@@ -325,6 +339,7 @@ protected:
bool GFX8Insts;
bool GFX9Insts;
bool GFX10Insts;
+ bool GFX10_3Insts;
bool GFX7GFX8GFX9Insts;
bool SGPRInitBug;
bool HasSMemRealTime;
@@ -342,7 +357,10 @@ protected:
bool HasDPP;
bool HasDPP8;
bool HasR128A16;
+ bool HasGFX10A16;
+ bool HasG16;
bool HasNSAEncoding;
+ bool GFX10_BEncoding;
bool HasDLInsts;
bool HasDot1Insts;
bool HasDot2Insts;
@@ -357,6 +375,8 @@ protected:
bool DoesNotSupportSRAMECC;
bool HasNoSdstCMPX;
bool HasVscnt;
+ bool HasGetWaveIdInst;
+ bool HasSMemTimeInst;
bool HasRegisterBanking;
bool HasVOP3Literal;
bool HasNoDataDepHazard;
@@ -426,6 +446,10 @@ public:
return CallLoweringInfo.get();
}
+ const InlineAsmLowering *getInlineAsmLowering() const override {
+ return InlineAsmLoweringInfo.get();
+ }
+
InstructionSelector *getInstructionSelector() const override {
return InstSelector.get();
}
@@ -453,10 +477,6 @@ public:
return (Generation)Gen;
}
- unsigned getWavefrontSizeLog2() const {
- return Log2_32(WavefrontSize);
- }
-
/// Return the number of high bits known to be zero for a frame index.
unsigned getKnownHighZeroBitsForFrameIndex() const {
return countLeadingZeros(MaxWaveScratchSize) + getWavefrontSizeLog2();
@@ -506,6 +526,10 @@ public:
return getGeneration() >= VOLCANIC_ISLANDS;
}
+ bool hasFractBug() const {
+ return getGeneration() == SOUTHERN_ISLANDS;
+ }
+
bool hasBFE() const {
return true;
}
@@ -587,6 +611,11 @@ public:
return getGeneration() <= SEA_ISLANDS;
}
+ /// Writes to VCC_LO/VCC_HI update the VCCZ flag.
+ bool partialVCCWritesUpdateVCCZ() const {
+ return getGeneration() >= GFX10;
+ }
+
/// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
/// was written by a VALU instruction.
bool hasSMRDReadVALUDefHazard() const {
@@ -617,20 +646,6 @@ public:
unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
const Function &) const;
- /// Alias for hasFP64FP16Denormals
- bool hasFP16Denormals(const Function &F) const {
- return FP64FP16Denormals;
- }
-
- /// Alias for hasFP64FP16Denormals
- bool hasFP64Denormals(const Function &F) const {
- return FP64FP16Denormals;
- }
-
- bool hasFP64FP16Denormals(const Function &F) const {
- return FP64FP16Denormals;
- }
-
bool supportsMinMaxDenormModes() const {
return getGeneration() >= AMDGPUSubtarget::GFX9;
}
@@ -724,6 +739,18 @@ public:
return ScalarFlatScratchInsts;
}
+ bool hasGlobalAddTidInsts() const {
+ return GFX10_BEncoding;
+ }
+
+ bool hasAtomicCSub() const {
+ return GFX10_BEncoding;
+ }
+
+ bool hasMultiDwordFlatScratchAddressing() const {
+ return getGeneration() >= GFX9;
+ }
+
bool hasFlatSegmentOffsetBug() const {
return HasFlatSegmentOffsetBug;
}
@@ -853,6 +880,14 @@ public:
return HasVscnt;
}
+ bool hasGetWaveIdInst() const {
+ return HasGetWaveIdInst;
+ }
+
+ bool hasSMemTimeInst() const {
+ return HasSMemTimeInst;
+ }
+
bool hasRegisterBanking() const {
return HasRegisterBanking;
}
@@ -890,30 +925,6 @@ public:
void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }
- /// \returns Number of execution units per compute unit supported by the
- /// subtarget.
- unsigned getEUsPerCU() const {
- return AMDGPU::IsaInfo::getEUsPerCU(this);
- }
-
- /// \returns Maximum number of waves per compute unit supported by the
- /// subtarget without any kind of limitation.
- unsigned getMaxWavesPerCU() const {
- return AMDGPU::IsaInfo::getMaxWavesPerCU(this);
- }
-
- /// \returns Maximum number of waves per compute unit supported by the
- /// subtarget and limited by given \p FlatWorkGroupSize.
- unsigned getMaxWavesPerCU(unsigned FlatWorkGroupSize) const {
- return AMDGPU::IsaInfo::getMaxWavesPerCU(this, FlatWorkGroupSize);
- }
-
- /// \returns Number of waves per work group supported by the subtarget and
- /// limited by given \p FlatWorkGroupSize.
- unsigned getWavesPerWorkGroup(unsigned FlatWorkGroupSize) const {
- return AMDGPU::IsaInfo::getWavesPerWorkGroup(this, FlatWorkGroupSize);
- }
-
// static wrappers
static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
@@ -979,6 +990,14 @@ public:
return HasR128A16;
}
+ bool hasGFX10A16() const {
+ return HasGFX10A16;
+ }
+
+ bool hasA16() const { return hasR128A16() || hasGFX10A16(); }
+
+ bool hasG16() const { return HasG16; }
+
bool hasOffset3fBug() const {
return HasOffset3fBug;
}
@@ -987,6 +1006,14 @@ public:
return HasNSAEncoding;
}
+ bool hasGFX10_BEncoding() const {
+ return GFX10_BEncoding;
+ }
+
+ bool hasGFX10_3Insts() const {
+ return GFX10_3Insts;
+ }
+
bool hasMadF16() const;
bool enableSIScheduler() const {
@@ -1059,6 +1086,8 @@ public:
return HasNSAtoVMEMBug;
}
+ bool hasHardClauses() const { return getGeneration() >= GFX10; }
+
/// Return the maximum number of waves per SIMD for kernels using \p SGPRs
/// SGPRs
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
@@ -1071,7 +1100,7 @@ public:
/// registers if provided.
/// Note, occupancy can be affected by the scratch allocation as well, but
/// we do not have enough information to compute it.
- unsigned computeOccupancy(const MachineFunction &MF, unsigned LDSSize = 0,
+ unsigned computeOccupancy(const Function &F, unsigned LDSSize = 0,
unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const;
/// \returns true if the flat_scratch register should be initialized with the
@@ -1178,7 +1207,7 @@ public:
const override;
bool isWave32() const {
- return WavefrontSize == 32;
+ return getWavefrontSize() == 32;
}
const TargetRegisterClass *getBoolRC() const {
@@ -1201,10 +1230,11 @@ public:
return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
}
- /// \returns Maximum number of waves per execution unit supported by the
- /// subtarget and limited by given \p FlatWorkGroupSize.
- unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const override {
- return AMDGPU::IsaInfo::getMaxWavesPerEU(this, FlatWorkGroupSize);
+ /// \returns Number of waves per execution unit required to support the given
+ /// \p FlatWorkGroupSize.
+ unsigned
+ getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
+ return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize);
}
/// \returns Minimum number of waves per execution unit supported by the
@@ -1213,7 +1243,8 @@ public:
return AMDGPU::IsaInfo::getMinWavesPerEU(this);
}
- void adjustSchedDependency(SUnit *Src, SUnit *Dst, SDep &Dep) const override;
+ void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
+ SDep &Dep) const override;
};
class R600Subtarget final : public R600GenSubtargetInfo,
@@ -1338,10 +1369,11 @@ public:
return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
}
- /// \returns Maximum number of waves per execution unit supported by the
- /// subtarget and limited by given \p FlatWorkGroupSize.
- unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const override {
- return AMDGPU::IsaInfo::getMaxWavesPerEU(this, FlatWorkGroupSize);
+ /// \returns Number of waves per execution unit required to support the given
+ /// \p FlatWorkGroupSize.
+ unsigned
+ getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
+ return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize);
}
/// \returns Minimum number of waves per execution unit supported by the
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index eb30d659bf0b5..b4b10835837cd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -16,6 +16,7 @@
#include "AMDGPU.h"
#include "AMDGPUAliasAnalysis.h"
#include "AMDGPUCallLowering.h"
+#include "AMDGPUExportClustering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUMacroFusion.h"
@@ -23,6 +24,7 @@
#include "AMDGPUTargetTransformInfo.h"
#include "GCNIterativeScheduler.h"
#include "GCNSchedStrategy.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "R600MachineScheduler.h"
#include "SIMachineFunctionInfo.h"
#include "SIMachineScheduler.h"
@@ -30,6 +32,7 @@
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
+#include "llvm/CodeGen/GlobalISel/Localizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/CodeGen/Passes.h"
@@ -138,6 +141,13 @@ static cl::opt<bool, true> EnableAMDGPUFunctionCallsOpt(
cl::init(true),
cl::Hidden);
+static cl::opt<bool, true> EnableAMDGPUFixedFunctionABIOpt(
+ "amdgpu-fixed-function-abi",
+ cl::desc("Enable all implicit function arguments"),
+ cl::location(AMDGPUTargetMachine::EnableFixedFunctionABI),
+ cl::init(false),
+ cl::Hidden);
+
// Enable lib calls simplifications
static cl::opt<bool> EnableLibCallSimplify(
"amdgpu-simplify-libcall",
@@ -183,6 +193,11 @@ static cl::opt<bool> EnableScalarIRPasses(
cl::init(true),
cl::Hidden);
+static cl::opt<bool> EnableStructurizerWorkarounds(
+ "amdgpu-enable-structurizer-workarounds",
+ cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true),
+ cl::Hidden);
+
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
// Register the target
RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -217,23 +232,29 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPULowerKernelAttributesPass(*PR);
initializeAMDGPULowerIntrinsicsPass(*PR);
initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
+ initializeAMDGPUPostLegalizerCombinerPass(*PR);
+ initializeAMDGPUPreLegalizerCombinerPass(*PR);
initializeAMDGPUPromoteAllocaPass(*PR);
+ initializeAMDGPUPromoteAllocaToVectorPass(*PR);
initializeAMDGPUCodeGenPreparePass(*PR);
initializeAMDGPUPropagateAttributesEarlyPass(*PR);
initializeAMDGPUPropagateAttributesLatePass(*PR);
initializeAMDGPURewriteOutArgumentsPass(*PR);
initializeAMDGPUUnifyMetadataPass(*PR);
initializeSIAnnotateControlFlowPass(*PR);
+ initializeSIInsertHardClausesPass(*PR);
initializeSIInsertWaitcntsPass(*PR);
initializeSIModeRegisterPass(*PR);
initializeSIWholeQuadModePass(*PR);
initializeSILowerControlFlowPass(*PR);
initializeSIRemoveShortExecBranchesPass(*PR);
+ initializeSIPreEmitPeepholePass(*PR);
initializeSIInsertSkipsPass(*PR);
initializeSIMemoryLegalizerPass(*PR);
initializeSIOptimizeExecMaskingPass(*PR);
initializeSIPreAllocateWWMRegsPass(*PR);
initializeSIFormMemoryClausesPass(*PR);
+ initializeSIPostRABundlerPass(*PR);
initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
initializeAMDGPUAAWrapperPassPass(*PR);
initializeAMDGPUExternalAAWrapperPass(*PR);
@@ -243,6 +264,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUPrintfRuntimeBindingPass(*PR);
initializeGCNRegBankReassignPass(*PR);
initializeGCNNSAReassignPass(*PR);
+ initializeSIAddIMGInitPass(*PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -264,6 +286,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
+ DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
return DAG;
}
@@ -363,10 +386,17 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
TLOF(createTLOF(getTargetTriple())) {
initAsmInfo();
+ if (TT.getArch() == Triple::amdgcn) {
+ if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64"))
+ MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave64));
+ else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32"))
+ MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32));
+ }
}
bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
bool AMDGPUTargetMachine::EnableFunctionCalls = false;
+bool AMDGPUTargetMachine::EnableFixedFunctionABI = false;
AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
@@ -416,20 +446,19 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
}
PM.add(createAMDGPUUnifyMetadataPass());
PM.add(createAMDGPUPrintfRuntimeBinding());
- PM.add(createAMDGPUPropagateAttributesLatePass(this));
- if (Internalize) {
+ if (Internalize)
PM.add(createInternalizePass(mustPreserveGV));
+ PM.add(createAMDGPUPropagateAttributesLatePass(this));
+ if (Internalize)
PM.add(createGlobalDCEPass());
- }
if (EarlyInline)
PM.add(createAMDGPUAlwaysInlinePass(false));
});
- const auto &Opt = Options;
Builder.addExtension(
PassManagerBuilder::EP_EarlyAsPossible,
- [AMDGPUAA, LibCallSimplify, &Opt, this](const PassManagerBuilder &,
- legacy::PassManagerBase &PM) {
+ [AMDGPUAA, LibCallSimplify, this](const PassManagerBuilder &,
+ legacy::PassManagerBase &PM) {
if (AMDGPUAA) {
PM.add(createAMDGPUAAWrapperPass());
PM.add(createAMDGPUExternalAAWrapperPass());
@@ -437,12 +466,12 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
PM.add(llvm::createAMDGPUPropagateAttributesEarlyPass(this));
PM.add(llvm::createAMDGPUUseNativeCallsPass());
if (LibCallSimplify)
- PM.add(llvm::createAMDGPUSimplifyLibCallsPass(Opt, this));
+ PM.add(llvm::createAMDGPUSimplifyLibCallsPass(this));
});
Builder.addExtension(
PassManagerBuilder::EP_CGSCCOptimizerLate,
- [](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
+ [EnableOpt](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
// Add infer address spaces pass to the opt pipeline after inlining
// but before SROA to increase SROA opportunities.
PM.add(createInferAddressSpacesPass());
@@ -450,6 +479,11 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
// This should run after inlining to have any chance of doing anything,
// and before other cleanup optimizations.
PM.add(createAMDGPULowerKernelAttributesPass());
+
+ // Promote alloca to vector before SROA and loop unroll. If we manage
+ // to eliminate allocas before unroll we may choose to unroll less.
+ if (EnableOpt)
+ PM.add(createAMDGPUPromoteAllocaToVector());
});
}
@@ -617,7 +651,9 @@ public:
bool addILPOpts() override;
bool addInstSelector() override;
bool addIRTranslator() override;
+ void addPreLegalizeMachineIR() override;
bool addLegalizeMachineIR() override;
+ void addPreRegBankSelect() override;
bool addRegBankSelect() override;
bool addGlobalInstructionSelect() override;
void addFastRegAlloc() override;
@@ -751,10 +787,15 @@ void AMDGPUPassConfig::addCodeGenPrepare() {
if (EnableLoadStoreVectorizer)
addPass(createLoadStoreVectorizerPass());
+
+ // LowerSwitch pass may introduce unreachable blocks that can
+ // cause unexpected behavior for subsequent passes. Placing it
+ // here seems better that these blocks would get cleaned up by
+ // UnreachableBlockElim inserted next in the pass flow.
+ addPass(createLowerSwitchPass());
}
bool AMDGPUPassConfig::addPreISel() {
- addPass(createLowerSwitchPass());
addPass(createFlattenCFGPass());
return false;
}
@@ -836,7 +877,11 @@ bool GCNPassConfig::addPreISel() {
// regions formed by them.
addPass(&AMDGPUUnifyDivergentExitNodesID);
if (!LateCFGStructurize) {
- addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
+ if (EnableStructurizerWorkarounds) {
+ addPass(createFixIrreduciblePass());
+ addPass(createUnifyLoopExitsPass());
+ }
+ addPass(createStructurizeCFGPass(false)); // false -> do not skip uniform regions
}
addPass(createSinkingPass());
addPass(createAMDGPUAnnotateUniformValues());
@@ -885,6 +930,12 @@ bool GCNPassConfig::addInstSelector() {
AMDGPUPassConfig::addInstSelector();
addPass(&SIFixSGPRCopiesID);
addPass(createSILowerI1CopiesPass());
+ // TODO: We have to add FinalizeISel to expand V_ADD/SUB_U64_PSEUDO before
+ // SIFixupVectorISel, which expects V_ADD/SUB -> A_ADDC/SUBB pairs already
+ // expanded. This will be removed as soon as SIFixupVectorISel is changed
+ // to work with V_ADD/SUB_U64_PSEUDO instead.
+ addPass(&FinalizeISelID);
addPass(createSIFixupVectorISelPass());
addPass(createSIAddIMGInitPass());
return false;
@@ -895,11 +946,22 @@ bool GCNPassConfig::addIRTranslator() {
return false;
}
+void GCNPassConfig::addPreLegalizeMachineIR() {
+ bool IsOptNone = getOptLevel() == CodeGenOpt::None;
+ addPass(createAMDGPUPreLegalizeCombiner(IsOptNone));
+ addPass(new Localizer());
+}
+
bool GCNPassConfig::addLegalizeMachineIR() {
addPass(new Legalizer());
return false;
}
+void GCNPassConfig::addPreRegBankSelect() {
+ bool IsOptNone = getOptLevel() == CodeGenOpt::None;
+ addPass(createAMDGPUPostLegalizeCombiner(IsOptNone));
+}
+
bool GCNPassConfig::addRegBankSelect() {
addPass(new RegBankSelect());
return false;
@@ -933,12 +995,9 @@ void GCNPassConfig::addFastRegAlloc() {
}
void GCNPassConfig::addOptimizedRegAlloc() {
- if (OptExecMaskPreRA) {
+ if (OptExecMaskPreRA)
insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
- insertPass(&SIOptimizeExecMaskingPreRAID, &SIFormMemoryClausesID);
- } else {
- insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
- }
+ insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
// This must be run immediately after phi elimination and before
// TwoAddressInstructions, otherwise the processing of the tied operand of
@@ -973,6 +1032,7 @@ void GCNPassConfig::addPostRegAlloc() {
}
void GCNPassConfig::addPreSched2() {
+ addPass(&SIPostRABundlerID);
}
void GCNPassConfig::addPreEmitPass() {
@@ -993,9 +1053,12 @@ void GCNPassConfig::addPreEmitPass() {
// FIXME: This stand-alone pass will emit indiv. S_NOP 0, as needed. It would
// be better for it to emit S_NOP <N> when possible.
addPass(&PostRAHazardRecognizerID);
+ if (getOptLevel() > CodeGenOpt::None)
+ addPass(&SIInsertHardClausesID);
addPass(&SIRemoveShortExecBranchesID);
addPass(&SIInsertSkipsPassID);
+ addPass(&SIPreEmitPeepholeID);
addPass(&BranchRelaxationPassID);
}
@@ -1024,11 +1087,13 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
MFI->initializeBaseYamlFields(YamlMFI);
- auto parseRegister = [&](const yaml::StringValue &RegName, unsigned &RegVal) {
- if (parseNamedRegisterReference(PFS, RegVal, RegName.Value, Error)) {
+ auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
+ Register TempReg;
+ if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) {
SourceRange = RegName.SourceRange;
return true;
}
+ RegVal = TempReg;
return false;
};
@@ -1046,7 +1111,6 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
};
if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
- parseRegister(YamlMFI.ScratchWaveOffsetReg, MFI->ScratchWaveOffsetReg) ||
parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
return true;
@@ -1056,11 +1120,6 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
}
- if (MFI->ScratchWaveOffsetReg != AMDGPU::SCRATCH_WAVE_OFFSET_REG &&
- !AMDGPU::SGPR_32RegClass.contains(MFI->ScratchWaveOffsetReg)) {
- return diagnoseRegisterClass(YamlMFI.ScratchWaveOffsetReg);
- }
-
if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
!AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
@@ -1080,7 +1139,7 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
return false;
if (A->IsRegister) {
- unsigned Reg;
+ Register Reg;
if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
SourceRange = A->RegisterName.SourceRange;
return true;
@@ -1154,8 +1213,10 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;
- MFI->Mode.FP32Denormals = YamlMFI.Mode.FP32Denormals;
- MFI->Mode.FP64FP16Denormals = YamlMFI.Mode.FP64FP16Denormals;
+ MFI->Mode.FP32InputDenormals = YamlMFI.Mode.FP32InputDenormals;
+ MFI->Mode.FP32OutputDenormals = YamlMFI.Mode.FP32OutputDenormals;
+ MFI->Mode.FP64FP16InputDenormals = YamlMFI.Mode.FP64FP16InputDenormals;
+ MFI->Mode.FP64FP16OutputDenormals = YamlMFI.Mode.FP64FP16OutputDenormals;
return false;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index 70fa3961236f2..e223fecc88195 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -39,6 +39,7 @@ protected:
public:
static bool EnableLateStructurizeCFG;
static bool EnableFunctionCalls;
+ static bool EnableFixedFunctionABI;
AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, TargetOptions Options,
@@ -56,8 +57,9 @@ public:
void adjustPassManager(PassManagerBuilder &) override;
/// Get the integer value of a null pointer in the given address space.
- uint64_t getNullPointerValue(unsigned AddrSpace) const {
+ static int64_t getNullPointerValue(unsigned AddrSpace) {
return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
+ AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
AddrSpace == AMDGPUAS::REGION_ADDRESS) ? -1 : 0;
}
};
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
index 819bebb7932d7..ed564ec1ad547 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
@@ -15,9 +15,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETOBJECTFILE_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETOBJECTFILE_H
-#include "AMDGPU.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
-#include "llvm/Target/TargetMachine.h"
namespace llvm {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index c4eeb81c5133e..542a5f006c0f7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -69,6 +69,21 @@ static cl::opt<unsigned> UnrollThresholdIf(
cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
cl::init(150), cl::Hidden);
+static cl::opt<bool> UnrollRuntimeLocal(
+ "amdgpu-unroll-runtime-local",
+ cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
+ cl::init(true), cl::Hidden);
+
+static cl::opt<bool> UseLegacyDA(
+ "amdgpu-use-legacy-divergence-analysis",
+ cl::desc("Enable legacy divergence analysis for AMDGPU"),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
+ "amdgpu-unroll-max-block-to-analyze",
+ cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
+ cl::init(20), cl::Hidden);
+
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
unsigned Depth = 0) {
const Instruction *I = dyn_cast<Instruction>(Cond);
@@ -172,6 +187,9 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
(!isa<GlobalVariable>(GEP->getPointerOperand()) &&
!isa<Argument>(GEP->getPointerOperand())))
continue;
+ LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
+ << *L << " due to LDS use.\n");
+ UP.Runtime = UnrollRuntimeLocal;
}
// Check if GEP depends on a value defined by this loop itself.
@@ -210,13 +228,22 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
if (UP.Threshold >= MaxBoost)
return;
}
+
+ // If we got a GEP in a small BB from inner loop then increase max trip
+ // count to analyze for better estimation cost in unroll
+ if (L->empty() && BB->size() < UnrollMaxBlockToAnalyze)
+ UP.MaxIterationsCountToAnalyze = 32;
}
}
+void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::PeelingPreferences &PP) {
+ BaseT::getPeelingPreferences(L, SE, PP);
+}
unsigned GCNTTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
// The concept of vector registers doesn't really exist. Some packed vector
// operations operate on the normal 32-bit registers.
- return 256;
+ return MaxVGPRs;
}
unsigned GCNTTIImpl::getNumberOfRegisters(bool Vec) const {
@@ -225,6 +252,13 @@ unsigned GCNTTIImpl::getNumberOfRegisters(bool Vec) const {
return getHardwareNumberOfRegisters(Vec) >> 3;
}
+unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
+ const SIRegisterInfo *TRI = ST->getRegisterInfo();
+ const TargetRegisterClass *RC = TRI->getRegClass(RCID);
+ unsigned NumVGPRs = (TRI->getRegSizeInBits(*RC) + 31) / 32;
+ return getHardwareNumberOfRegisters(false) / NumVGPRs;
+}
+
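
The new getNumberOfRegisters overload divides the VGPR budget by how many 32-bit registers one value of the queried class occupies, rounding the class size up to whole dwords. The same arithmetic in isolation, with the budget and class size as plain parameters (names are illustrative):

// Sketch: how many values of a register class fit in the VGPR budget.
// MaxVGPRs and RegSizeInBits stand in for the subtarget / register-info
// queries used in the real implementation.
unsigned numberOfRegistersForClass(unsigned MaxVGPRs, unsigned RegSizeInBits) {
  unsigned DwordsPerValue = (RegSizeInBits + 31) / 32; // round up to dwords
  return MaxVGPRs / DwordsPerValue;
}
// e.g. numberOfRegistersForClass(256, 64) == 128 for a 64-bit class.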
unsigned GCNTTIImpl::getRegisterBitWidth(bool Vector) const {
return 32;
}
@@ -234,8 +268,8 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
}
unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
- unsigned ChainSizeInBytes,
- VectorType *VecTy) const {
+ unsigned ChainSizeInBytes,
+ VectorType *VecTy) const {
unsigned VecRegBitWidth = VF * LoadSize;
if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
// TODO: Support element-size less than 32bit?
@@ -262,20 +296,16 @@ unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
return 512;
}
- if (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
- AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
- AddrSpace == AMDGPUAS::REGION_ADDRESS)
- return 128;
-
if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
return 8 * ST->getMaxPrivateElementSize();
- llvm_unreachable("unhandled address space");
+ // Common to flat, global, local and region. Assume for unknown addrspace.
+ return 128;
}
bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
- unsigned Alignment,
- unsigned AddrSpace) const {
+ Align Alignment,
+ unsigned AddrSpace) const {
// We allow vectorization of flat stores, even though we may need to decompose
// them later if they may access private memory. We don't have enough context
// here, and legalization can handle it.
@@ -287,17 +317,87 @@ bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
}
bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
- unsigned Alignment,
- unsigned AddrSpace) const {
+ Align Alignment,
+ unsigned AddrSpace) const {
return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}
bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
- unsigned Alignment,
- unsigned AddrSpace) const {
+ Align Alignment,
+ unsigned AddrSpace) const {
return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}
+// FIXME: Really we would like to issue multiple 128-bit loads and stores per
+// iteration. Should we report a larger size and let it legalize?
+//
+// FIXME: Should we use narrower types for local/region, or account for when
+// unaligned access is legal?
+//
+// FIXME: This could use fine tuning and microbenchmarks.
+Type *GCNTTIImpl::getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
+ unsigned SrcAddrSpace,
+ unsigned DestAddrSpace,
+ unsigned SrcAlign,
+ unsigned DestAlign) const {
+ unsigned MinAlign = std::min(SrcAlign, DestAlign);
+
+ // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the
+ // hardware into byte accesses. If you assume all alignments are equally
+ // probable, it's more efficient on average to use short accesses for this
+ // case.
+ if (MinAlign == 2)
+ return Type::getInt16Ty(Context);
+
+ // Not all subtargets have 128-bit DS instructions, and we currently don't
+ // form them by default.
+ if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
+ SrcAddrSpace == AMDGPUAS::REGION_ADDRESS ||
+ DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
+ DestAddrSpace == AMDGPUAS::REGION_ADDRESS) {
+ return FixedVectorType::get(Type::getInt32Ty(Context), 2);
+ }
+
+ // Global memory works best with 16-byte accesses. Private memory will also
+ // hit this, although they'll be decomposed.
+ return FixedVectorType::get(Type::getInt32Ty(Context), 4);
+}
+
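
The element-type choice above reduces to three cases: 16-bit accesses when the common alignment is exactly 2, <2 x i32> when either side is LDS/region memory (128-bit DS ops are not formed by default), and <4 x i32> otherwise. A compressed sketch of that decision, taking booleans instead of address-space constants so it stays self-contained:

// Sketch of the memcpy element-type choice above, reduced to its inputs.
// Returns a string naming the IR type the real code would build.
const char *memcpyLoopElementType(unsigned SrcAlign, unsigned DestAlign,
                                  bool SrcOrDstIsLocalOrRegion) {
  unsigned MinAlign = SrcAlign < DestAlign ? SrcAlign : DestAlign;
  // An address == 2 (mod 4) is split into byte accesses by the hardware,
  // so short accesses win on average for 2-byte alignment.
  if (MinAlign == 2)
    return "i16";
  // No 128-bit DS ops are formed by default, so cap LDS/region at 8 bytes.
  if (SrcOrDstIsLocalOrRegion)
    return "<2 x i32>";
  // Global (and private, after decomposition) prefers 16-byte accesses.
  return "<4 x i32>";
}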
+void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
+ SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
+ unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
+ unsigned SrcAlign, unsigned DestAlign) const {
+ assert(RemainingBytes < 16);
+
+ unsigned MinAlign = std::min(SrcAlign, DestAlign);
+
+ if (MinAlign != 2) {
+ Type *I64Ty = Type::getInt64Ty(Context);
+ while (RemainingBytes >= 8) {
+ OpsOut.push_back(I64Ty);
+ RemainingBytes -= 8;
+ }
+
+ Type *I32Ty = Type::getInt32Ty(Context);
+ while (RemainingBytes >= 4) {
+ OpsOut.push_back(I32Ty);
+ RemainingBytes -= 4;
+ }
+ }
+
+ Type *I16Ty = Type::getInt16Ty(Context);
+ while (RemainingBytes >= 2) {
+ OpsOut.push_back(I16Ty);
+ RemainingBytes -= 2;
+ }
+
+ Type *I8Ty = Type::getInt8Ty(Context);
+ while (RemainingBytes) {
+ OpsOut.push_back(I8Ty);
+ --RemainingBytes;
+ }
+}
+
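
The residual lowering is a greedy decomposition of the final bytes into the widest legal chunks, skipping the 8- and 4-byte pieces when the common alignment is 2. A standalone sketch that just records chunk sizes (the byte count and alignment in the driver are illustrative):

#include <cstdio>
#include <vector>

// Greedy residual split: widest chunks first, honoring the 2-byte-alignment
// restriction, mirroring getMemcpyLoopResidualLoweringType above.
std::vector<unsigned> splitResidual(unsigned RemainingBytes, unsigned MinAlign) {
  std::vector<unsigned> ChunkBytes;
  if (MinAlign != 2) {
    while (RemainingBytes >= 8) { ChunkBytes.push_back(8); RemainingBytes -= 8; }
    while (RemainingBytes >= 4) { ChunkBytes.push_back(4); RemainingBytes -= 4; }
  }
  while (RemainingBytes >= 2) { ChunkBytes.push_back(2); RemainingBytes -= 2; }
  while (RemainingBytes)      { ChunkBytes.push_back(1); --RemainingBytes; }
  return ChunkBytes;
}

int main() {
  for (unsigned B : splitResidual(15, 4)) // expect 8, 4, 2, 1
    std::printf("%u ", B);
  std::printf("\n");
}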
unsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) {
// Disable unrolling if the loop is not vectorized.
// TODO: Enable this again.
@@ -339,6 +439,7 @@ bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
}
int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+ TTI::TargetCostKind CostKind,
TTI::OperandValueKind Opd1Info,
TTI::OperandValueKind Opd2Info,
TTI::OperandValueProperties Opd1PropInfo,
@@ -347,7 +448,11 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
const Instruction *CxtI) {
EVT OrigTy = TLI->getValueType(DL, Ty);
if (!OrigTy.isSimple()) {
- return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+ // FIXME: We're having to query the throughput cost so that the basic
+ // implementation tries to generate legalization and scalarization costs.
+ // Maybe we could hoist the scalarization code here?
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, TTI::TCK_RecipThroughput,
+ Opd1Info, Opd2Info,
Opd1PropInfo, Opd2PropInfo);
}
@@ -455,24 +560,44 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
return LT.first * NElts * Cost;
}
break;
+ case ISD::FNEG:
+ // Use the backend's estimation. If fneg is not free, each element will
+ // cost one additional instruction.
+ return TLI->isFNegFree(SLT) ? 0 : NElts;
default:
break;
}
- return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
+ Opd2Info,
Opd1PropInfo, Opd2PropInfo);
}
-template <typename T>
-int GCNTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
- ArrayRef<T *> Args,
- FastMathFlags FMF, unsigned VF) {
- if (ID != Intrinsic::fma)
- return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
+// Return true if there's a potential benefit from using v2f16 instructions for
+// an intrinsic, even if it requires nontrivial legalization.
+static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
+ switch (ID) {
+ case Intrinsic::fma: // TODO: fmuladd
+ // There's a small benefit to using vector ops in the legalized code.
+ case Intrinsic::round:
+ return true;
+ default:
+ return false;
+ }
+}
+
+int GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind) {
+ if (ICA.getID() == Intrinsic::fabs)
+ return 0;
+ if (!intrinsicHasPackedVectorBenefit(ICA.getID()))
+ return BaseT::getIntrinsicInstrCost(ICA, CostKind);
+
+ Type *RetTy = ICA.getReturnType();
EVT OrigTy = TLI->getValueType(DL, RetTy);
if (!OrigTy.isSimple()) {
- return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
+ return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}
// Legalize the type.
@@ -489,36 +614,34 @@ int GCNTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
if (ST->has16BitInsts() && SLT == MVT::f16)
NElts = (NElts + 1) / 2;
- return LT.first * NElts * (ST->hasFastFMAF32() ? getHalfRateInstrCost()
- : getQuarterRateInstrCost());
-}
+ // TODO: Get more refined intrinsic costs?
+ unsigned InstRate = getQuarterRateInstrCost();
+ if (ICA.getID() == Intrinsic::fma) {
+ InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost()
+ : getQuarterRateInstrCost();
+ }
-int GCNTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
- ArrayRef<Value*> Args, FastMathFlags FMF,
- unsigned VF) {
- return getIntrinsicInstrCost<Value>(ID, RetTy, Args, FMF, VF);
+ return LT.first * NElts * InstRate;
}
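
For the packed cases, the intrinsic cost is the legalization factor times the element count (halved for f16 when packed 16-bit instructions exist) times a per-instruction rate that depends on whether fast f32 FMA is available. A sketch of just that arithmetic, with the half/quarter-rate values written in as assumed constants:

// Sketch of the packed-intrinsic cost arithmetic above. The rate values are
// assumptions mirroring the half-/quarter-rate helpers, not LLVM API.
unsigned packedIntrinsicCost(unsigned LegalizeFactor, unsigned NElts,
                             bool IsF16, bool Has16BitInsts, bool IsFMA,
                             bool HasFastFMAF32) {
  if (Has16BitInsts && IsF16)
    NElts = (NElts + 1) / 2; // two f16 elements per packed operation
  const unsigned QuarterRate = 4, HalfRate = 2; // assumed cost values
  unsigned InstRate = QuarterRate;
  if (IsFMA)
    InstRate = HasFastFMAF32 ? HalfRate : QuarterRate;
  return LegalizeFactor * NElts * InstRate;
}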
-int GCNTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
- ArrayRef<Type *> Tys, FastMathFlags FMF,
- unsigned ScalarizationCostPassed) {
- return getIntrinsicInstrCost<Type>(ID, RetTy, Tys, FMF,
- ScalarizationCostPassed);
-}
+unsigned GCNTTIImpl::getCFInstrCost(unsigned Opcode,
+ TTI::TargetCostKind CostKind) {
+ if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
+ return Opcode == Instruction::PHI ? 0 : 1;
-unsigned GCNTTIImpl::getCFInstrCost(unsigned Opcode) {
// XXX - For some reason this isn't called for switch.
switch (Opcode) {
case Instruction::Br:
case Instruction::Ret:
return 10;
default:
- return BaseT::getCFInstrCost(Opcode);
+ return BaseT::getCFInstrCost(Opcode, CostKind);
}
}
-int GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *Ty,
- bool IsPairwise) {
+int GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
+ bool IsPairwise,
+ TTI::TargetCostKind CostKind) {
EVT OrigTy = TLI->getValueType(DL, Ty);
// Computes cost on targets that have packed math instructions (which support
@@ -526,15 +649,15 @@ int GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *Ty,
if (IsPairwise ||
!ST->hasVOP3PInsts() ||
OrigTy.getScalarSizeInBits() != 16)
- return BaseT::getArithmeticReductionCost(Opcode, Ty, IsPairwise);
+ return BaseT::getArithmeticReductionCost(Opcode, Ty, IsPairwise, CostKind);
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
return LT.first * getFullRateInstrCost();
}
-int GCNTTIImpl::getMinMaxReductionCost(Type *Ty, Type *CondTy,
- bool IsPairwise,
- bool IsUnsigned) {
+int GCNTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
+ bool IsPairwise, bool IsUnsigned,
+ TTI::TargetCostKind CostKind) {
EVT OrigTy = TLI->getValueType(DL, Ty);
// Computes cost on targets that have packed math instructions (which support
@@ -542,7 +665,8 @@ int GCNTTIImpl::getMinMaxReductionCost(Type *Ty, Type *CondTy,
if (IsPairwise ||
!ST->hasVOP3PInsts() ||
OrigTy.getScalarSizeInBits() != 16)
- return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned);
+ return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned,
+ CostKind);
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
return LT.first * getHalfRateInstrCost();
@@ -573,8 +697,6 @@ int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
}
}
-
-
static bool isArgPassedInSGPR(const Argument *A) {
const Function *F = A->getParent();
@@ -601,6 +723,58 @@ static bool isArgPassedInSGPR(const Argument *A) {
}
}
+/// Analyze if the results of inline asm are divergent. If \p Indices is empty,
+/// this analyzes the collective result of all output registers. Otherwise,
+/// only the specific result index is queried, for asm that returns multiple
+/// registers in a struct.
+bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
+ const CallInst *CI, ArrayRef<unsigned> Indices) const {
+ // TODO: Handle complex extract indices
+ if (Indices.size() > 1)
+ return true;
+
+ const DataLayout &DL = CI->getModule()->getDataLayout();
+ const SIRegisterInfo *TRI = ST->getRegisterInfo();
+ TargetLowering::AsmOperandInfoVector TargetConstraints =
+ TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);
+
+ const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];
+
+ int OutputIdx = 0;
+ for (auto &TC : TargetConstraints) {
+ if (TC.Type != InlineAsm::isOutput)
+ continue;
+
+ // Skip outputs we don't care about.
+ if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
+ continue;
+
+ TLI->ComputeConstraintToUse(TC, SDValue());
+
+ Register AssignedReg;
+ const TargetRegisterClass *RC;
+ std::tie(AssignedReg, RC) = TLI->getRegForInlineAsmConstraint(
+ TRI, TC.ConstraintCode, TC.ConstraintVT);
+ if (AssignedReg) {
+ // FIXME: This is a workaround for getRegForInlineAsmConstraint
+ // returning VS_32
+ RC = TRI->getPhysRegClass(AssignedReg);
+ }
+
+ // For AGPR constraints null is returned on subtargets without AGPRs, so
+ // assume divergent for null.
+ if (!RC || !TRI->isSGPRClass(RC))
+ return true;
+ }
+
+ return false;
+}
+
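
Stripped of the TargetLowering plumbing, the divergence test above reduces to: the asm result is uniform only if every output constraint we care about lands in an SGPR class; an unknown class or any vector class is treated as divergent. A hedged standalone sketch of that reduction (the struct and function names are made up):

#include <vector>

// Each element describes one asm output: whether a register class could be
// determined at all, and whether that class is an SGPR class.
struct AsmOutput { bool HasKnownClass; bool IsSGPRClass; };

// Mirrors isInlineAsmSourceOfDivergence: TargetOutputIdx == -1 means
// "consider all outputs"; otherwise only the selected output is checked.
bool asmOutputsAreDivergent(const std::vector<AsmOutput> &Outputs,
                            int TargetOutputIdx = -1) {
  int Idx = 0;
  for (const AsmOutput &O : Outputs) {
    if (TargetOutputIdx != -1 && TargetOutputIdx != Idx++)
      continue; // skip outputs we don't care about
    if (!O.HasKnownClass || !O.IsSGPRClass)
      return true; // unknown or vector class: assume divergent
  }
  return false; // every relevant output is scalar
}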
+/// \returns true if the new GPU divergence analysis is enabled.
+bool GCNTTIImpl::useGPUDivergenceAnalysis() const {
+ return !UseLegacyDA;
+}
+
/// \returns true if the result of the value could potentially be
/// different across workitems in a wavefront.
bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
@@ -628,7 +802,14 @@ bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());
// Assume all function calls are a source of divergence.
- if (isa<CallInst>(V) || isa<InvokeInst>(V))
+ if (const CallInst *CI = dyn_cast<CallInst>(V)) {
+ if (CI->isInlineAsm())
+ return isInlineAsmSourceOfDivergence(CI);
+ return true;
+ }
+
+ // Assume all function calls are a source of divergence.
+ if (isa<InvokeInst>(V))
return true;
return false;
@@ -643,9 +824,44 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
case Intrinsic::amdgcn_readlane:
case Intrinsic::amdgcn_icmp:
case Intrinsic::amdgcn_fcmp:
+ case Intrinsic::amdgcn_ballot:
+ case Intrinsic::amdgcn_if_break:
return true;
}
}
+
+ if (const CallInst *CI = dyn_cast<CallInst>(V)) {
+ if (CI->isInlineAsm())
+ return !isInlineAsmSourceOfDivergence(CI);
+ return false;
+ }
+
+ const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
+ if (!ExtValue)
+ return false;
+
+ const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));
+ if (!CI)
+ return false;
+
+ if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
+ switch (Intrinsic->getIntrinsicID()) {
+ default:
+ return false;
+ case Intrinsic::amdgcn_if:
+ case Intrinsic::amdgcn_else: {
+ ArrayRef<unsigned> Indices = ExtValue->getIndices();
+ return Indices.size() == 1 && Indices[0] == 1;
+ }
+ }
+ }
+
+ // If we have inline asm returning mixed SGPR and VGPR results, the overall
+ // struct return was inferred to be divergent. We need to override that in
+ // the case where we're extracting an SGPR component here.
+ if (CI->isInlineAsm())
+ return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());
+
return false;
}
@@ -666,8 +882,9 @@ bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
}
}
-bool GCNTTIImpl::rewriteIntrinsicWithAddressSpace(
- IntrinsicInst *II, Value *OldV, Value *NewV) const {
+Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
+ Value *OldV,
+ Value *NewV) const {
auto IntrID = II->getIntrinsicID();
switch (IntrID) {
case Intrinsic::amdgcn_atomic_inc:
@@ -677,7 +894,7 @@ bool GCNTTIImpl::rewriteIntrinsicWithAddressSpace(
case Intrinsic::amdgcn_ds_fmax: {
const ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4));
if (!IsVolatile->isZero())
- return false;
+ return nullptr;
Module *M = II->getParent()->getParent()->getParent();
Type *DestTy = II->getType();
Type *SrcTy = NewV->getType();
@@ -685,7 +902,7 @@ bool GCNTTIImpl::rewriteIntrinsicWithAddressSpace(
Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
II->setArgOperand(0, NewV);
II->setCalledFunction(NewDecl);
- return true;
+ return II;
}
case Intrinsic::amdgcn_is_shared:
case Intrinsic::amdgcn_is_private: {
@@ -695,20 +912,49 @@ bool GCNTTIImpl::rewriteIntrinsicWithAddressSpace(
LLVMContext &Ctx = NewV->getType()->getContext();
ConstantInt *NewVal = (TrueAS == NewAS) ?
ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);
- II->replaceAllUsesWith(NewVal);
- II->eraseFromParent();
- return true;
+ return NewVal;
+ }
+ case Intrinsic::ptrmask: {
+ unsigned OldAS = OldV->getType()->getPointerAddressSpace();
+ unsigned NewAS = NewV->getType()->getPointerAddressSpace();
+ Value *MaskOp = II->getArgOperand(1);
+ Type *MaskTy = MaskOp->getType();
+
+ bool DoTruncate = false;
+ if (!getTLI()->isNoopAddrSpaceCast(OldAS, NewAS)) {
+ // All valid 64-bit to 32-bit casts work by chopping off the high
+ // bits. Any masking only clearing the low bits will also apply in the new
+ // address space.
+ if (DL.getPointerSizeInBits(OldAS) != 64 ||
+ DL.getPointerSizeInBits(NewAS) != 32)
+ return nullptr;
+
+ // TODO: Do we need to thread more context in here?
+ KnownBits Known = computeKnownBits(MaskOp, DL, 0, nullptr, II);
+ if (Known.countMinLeadingOnes() < 32)
+ return nullptr;
+
+ DoTruncate = true;
+ }
+
+ IRBuilder<> B(II);
+ if (DoTruncate) {
+ MaskTy = B.getInt32Ty();
+ MaskOp = B.CreateTrunc(MaskOp, MaskTy);
+ }
+
+ return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
+ {NewV, MaskOp});
}
default:
- return false;
+ return nullptr;
}
}
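
The ptrmask case above hinges on one check: when rewriting a 64-bit flat pointer to a 32-bit address space, the mask can only be carried over if it provably leaves the low 32 bits' complement alone, i.e. its high 32 bits are all ones, in which case it is truncated. A simplified sketch using plain integers, where the real code's computeKnownBits query is replaced by requiring a constant mask:

#include <cstdint>
#include <optional>

// Sketch of the ptrmask address-space rewrite check above: a 64-bit mask can
// be reused in a 32-bit address space only if it cannot clear any high bits.
// Returns the truncated 32-bit mask, or nothing if the rewrite must bail.
std::optional<uint32_t> truncateMaskFor32BitAS(uint64_t Mask64) {
  if ((Mask64 >> 32) != 0xffffffffu)
    return std::nullopt; // might clear high bits; give up
  return static_cast<uint32_t>(Mask64); // only low bits are masked
}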
-unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
- Type *SubTp) {
+unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *VT,
+ int Index, VectorType *SubTp) {
if (ST->hasVOP3PInsts()) {
- VectorType *VT = cast<VectorType>(Tp);
- if (VT->getNumElements() == 2 &&
+ if (cast<FixedVectorType>(VT)->getNumElements() == 2 &&
DL.getTypeSizeInBits(VT->getElementType()) == 16) {
// With op_sel VOP3P instructions freely can access the low half or high
// half of a register, so any swizzle is free.
@@ -724,7 +970,7 @@ unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
}
}
- return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+ return BaseT::getShuffleCost(Kind, VT, Index, SubTp);
}
bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
@@ -745,8 +991,8 @@ bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
// FIXME: dx10_clamp can just take the caller setting, but there seems to be
// no way to support merge for backend defined attributes.
- AMDGPU::SIModeRegisterDefaults CallerMode(*Caller, *CallerST);
- AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee, *CalleeST);
+ AMDGPU::SIModeRegisterDefaults CallerMode(*Caller);
+ AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee);
return CallerMode.isInlineCompatible(CalleeMode);
}
@@ -755,117 +1001,9 @@ void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
CommonTTI.getUnrollingPreferences(L, SE, UP);
}
-unsigned GCNTTIImpl::getUserCost(const User *U,
- ArrayRef<const Value *> Operands) {
- const Instruction *I = dyn_cast<Instruction>(U);
- if (!I)
- return BaseT::getUserCost(U, Operands);
-
- // Estimate different operations to be optimized out
- switch (I->getOpcode()) {
- case Instruction::ExtractElement: {
- ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1));
- unsigned Idx = -1;
- if (CI)
- Idx = CI->getZExtValue();
- return getVectorInstrCost(I->getOpcode(), I->getOperand(0)->getType(), Idx);
- }
- case Instruction::InsertElement: {
- ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(2));
- unsigned Idx = -1;
- if (CI)
- Idx = CI->getZExtValue();
- return getVectorInstrCost(I->getOpcode(), I->getType(), Idx);
- }
- case Instruction::Call: {
- if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) {
- SmallVector<Value *, 4> Args(II->arg_operands());
- FastMathFlags FMF;
- if (auto *FPMO = dyn_cast<FPMathOperator>(II))
- FMF = FPMO->getFastMathFlags();
- return getIntrinsicInstrCost(II->getIntrinsicID(), II->getType(), Args,
- FMF);
- } else {
- return BaseT::getUserCost(U, Operands);
- }
- }
- case Instruction::ShuffleVector: {
- const ShuffleVectorInst *Shuffle = cast<ShuffleVectorInst>(I);
- Type *Ty = Shuffle->getType();
- Type *SrcTy = Shuffle->getOperand(0)->getType();
-
- // TODO: Identify and add costs for insert subvector, etc.
- int SubIndex;
- if (Shuffle->isExtractSubvectorMask(SubIndex))
- return getShuffleCost(TTI::SK_ExtractSubvector, SrcTy, SubIndex, Ty);
-
- if (Shuffle->changesLength())
- return BaseT::getUserCost(U, Operands);
-
- if (Shuffle->isIdentity())
- return 0;
-
- if (Shuffle->isReverse())
- return getShuffleCost(TTI::SK_Reverse, Ty, 0, nullptr);
-
- if (Shuffle->isSelect())
- return getShuffleCost(TTI::SK_Select, Ty, 0, nullptr);
-
- if (Shuffle->isTranspose())
- return getShuffleCost(TTI::SK_Transpose, Ty, 0, nullptr);
-
- if (Shuffle->isZeroEltSplat())
- return getShuffleCost(TTI::SK_Broadcast, Ty, 0, nullptr);
-
- if (Shuffle->isSingleSource())
- return getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, 0, nullptr);
-
- return getShuffleCost(TTI::SK_PermuteTwoSrc, Ty, 0, nullptr);
- }
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::FPExt:
- case Instruction::PtrToInt:
- case Instruction::IntToPtr:
- case Instruction::SIToFP:
- case Instruction::UIToFP:
- case Instruction::Trunc:
- case Instruction::FPTrunc:
- case Instruction::BitCast:
- case Instruction::AddrSpaceCast: {
- return getCastInstrCost(I->getOpcode(), I->getType(),
- I->getOperand(0)->getType(), I);
- }
- case Instruction::Add:
- case Instruction::FAdd:
- case Instruction::Sub:
- case Instruction::FSub:
- case Instruction::Mul:
- case Instruction::FMul:
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::FDiv:
- case Instruction::URem:
- case Instruction::SRem:
- case Instruction::FRem:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- case Instruction::FNeg: {
- return getArithmeticInstrCost(I->getOpcode(), I->getType(),
- TTI::OK_AnyValue, TTI::OK_AnyValue,
- TTI::OP_None, TTI::OP_None, Operands, I);
- }
- default:
- break;
- }
-
- return BaseT::getUserCost(U, Operands);
+void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::PeelingPreferences &PP) {
+ CommonTTI.getPeelingPreferences(L, SE, PP);
}
unsigned R600TTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
@@ -903,7 +1041,7 @@ unsigned R600TTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
}
bool R600TTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
- unsigned Alignment,
+ Align Alignment,
unsigned AddrSpace) const {
// We allow vectorization of flat stores, even though we may need to decompose
// them later if they may access private memory. We don't have enough context
@@ -912,13 +1050,13 @@ bool R600TTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
}
bool R600TTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
- unsigned Alignment,
+ Align Alignment,
unsigned AddrSpace) const {
return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}
bool R600TTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
- unsigned Alignment,
+ Align Alignment,
unsigned AddrSpace) const {
return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}
@@ -932,14 +1070,18 @@ unsigned R600TTIImpl::getMaxInterleaveFactor(unsigned VF) {
return 8;
}
-unsigned R600TTIImpl::getCFInstrCost(unsigned Opcode) {
+unsigned R600TTIImpl::getCFInstrCost(unsigned Opcode,
+ TTI::TargetCostKind CostKind) {
+ if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
+ return Opcode == Instruction::PHI ? 0 : 1;
+
// XXX - For some reason this isn't called for switch.
switch (Opcode) {
case Instruction::Br:
case Instruction::Ret:
return 10;
default:
- return BaseT::getCFInstrCost(Opcode);
+ return BaseT::getCFInstrCost(Opcode, CostKind);
}
}
@@ -970,3 +1112,8 @@ void R600TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP) {
CommonTTI.getUnrollingPreferences(L, SE, UP);
}
+
+void R600TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::PeelingPreferences &PP) {
+ CommonTTI.getPeelingPreferences(L, SE, PP);
+}
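
A note on the TTI changes above: the cost hooks now take a TTI::TargetCostKind, so a single hook such as R600TTIImpl::getCFInstrCost can answer both size-oriented and throughput-oriented queries (PHIs become free and other control flow costs one unit for size queries, while branches and returns keep the expensive throughput estimate). The standalone C++ sketch below only mirrors that dispatch; the enum and helper names are illustrative stand-ins, not LLVM API.

// Illustrative sketch: mirrors the CostKind dispatch added to
// R600TTIImpl::getCFInstrCost above, using hypothetical standalone types.
#include <cstdio>

enum class CostKind { RecipThroughput, Latency, CodeSize, SizeAndLatency };
enum class Opcode { PHI, Br, Ret, Other };

// Size-style queries treat PHIs as free and everything else as one unit;
// throughput-style queries keep the expensive estimate for control flow.
static unsigned getCFInstrCost(Opcode Op, CostKind Kind) {
  if (Kind == CostKind::CodeSize || Kind == CostKind::SizeAndLatency)
    return Op == Opcode::PHI ? 0 : 1;
  switch (Op) {
  case Opcode::Br:
  case Opcode::Ret:
    return 10; // R600 models branches/returns as expensive for throughput.
  default:
    return 1;  // Stand-in for the BaseT fallback in the real code.
  }
}

int main() {
  std::printf("br, code size:        %u\n",
              getCFInstrCost(Opcode::Br, CostKind::CodeSize));
  std::printf("br, recip throughput: %u\n",
              getCFInstrCost(Opcode::Br, CostKind::RecipThroughput));
}
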
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 0b48f9f602b71..3364a9bcaccbb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -61,6 +61,9 @@ public:
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP);
+
+ void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::PeelingPreferences &PP);
};
class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
@@ -70,10 +73,11 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
friend BaseT;
const GCNSubtarget *ST;
- const AMDGPUTargetLowering *TLI;
+ const SITargetLowering *TLI;
AMDGPUTTIImpl CommonTTI;
bool IsGraphicsShader;
bool HasFP32Denormals;
+ unsigned MaxVGPRs;
const FeatureBitset InlineFeatureIgnoreList = {
// Codegen control options which don't matter.
@@ -133,13 +137,21 @@ public:
TLI(ST->getTargetLowering()),
CommonTTI(TM, F),
IsGraphicsShader(AMDGPU::isShader(F.getCallingConv())),
- HasFP32Denormals(ST->hasFP32Denormals(F)) { }
+ HasFP32Denormals(AMDGPU::SIModeRegisterDefaults(F).allFP32Denormals()),
+ MaxVGPRs(ST->getMaxNumVGPRs(
+ std::max(ST->getWavesPerEU(F).first,
+ ST->getWavesPerEUForWorkGroup(
+ ST->getFlatWorkGroupSizes(F).second)))) {}
bool hasBranchDivergence() { return true; }
+ bool useGPUDivergenceAnalysis() const;
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP);
+ void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::PeelingPreferences &PP);
+
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) {
assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
return TTI::PSK_FastHardware;
@@ -147,6 +159,7 @@ public:
unsigned getHardwareNumberOfRegisters(bool Vector) const;
unsigned getNumberOfRegisters(bool Vector) const;
+ unsigned getNumberOfRegisters(unsigned RCID) const;
unsigned getRegisterBitWidth(bool Vector) const;
unsigned getMinVectorRegisterBitWidth() const;
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
@@ -157,22 +170,30 @@ public:
VectorType *VecTy) const;
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
- bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
- unsigned Alignment,
+ bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment,
unsigned AddrSpace) const;
- bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
- unsigned Alignment,
+ bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment,
unsigned AddrSpace) const;
- bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
- unsigned Alignment,
+ bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
unsigned AddrSpace) const;
-
+ Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
+ unsigned SrcAddrSpace, unsigned DestAddrSpace,
+ unsigned SrcAlign, unsigned DestAlign) const;
+
+ void getMemcpyLoopResidualLoweringType(SmallVectorImpl<Type *> &OpsOut,
+ LLVMContext &Context,
+ unsigned RemainingBytes,
+ unsigned SrcAddrSpace,
+ unsigned DestAddrSpace,
+ unsigned SrcAlign,
+ unsigned DestAlign) const;
unsigned getMaxInterleaveFactor(unsigned VF);
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const;
int getArithmeticInstrCost(
unsigned Opcode, Type *Ty,
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
@@ -180,7 +201,10 @@ public:
ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
const Instruction *CxtI = nullptr);
- unsigned getCFInstrCost(unsigned Opcode);
+ unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind);
+
+ bool isInlineAsmSourceOfDivergence(const CallInst *CI,
+ ArrayRef<unsigned> Indices = {}) const;
int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
bool isSourceOfDivergence(const Value *V) const;
@@ -196,13 +220,13 @@ public:
bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
Intrinsic::ID IID) const;
- bool rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
- Value *OldV, Value *NewV) const;
+ Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
+ Value *NewV) const;
unsigned getVectorSplitCost() { return 0; }
- unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
- Type *SubTp);
+ unsigned getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index,
+ VectorType *SubTp);
bool areInlineCompatible(const Function *Caller,
const Function *Callee) const;
@@ -211,23 +235,17 @@ public:
int getInlinerVectorBonusPercent() { return 0; }
- int getArithmeticReductionCost(unsigned Opcode,
- Type *Ty,
- bool IsPairwise);
- template <typename T>
- int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
- ArrayRef<T *> Args, FastMathFlags FMF,
- unsigned VF);
- int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
- ArrayRef<Type *> Tys, FastMathFlags FMF,
- unsigned ScalarizationCostPassed = UINT_MAX);
- int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
- ArrayRef<Value *> Args, FastMathFlags FMF,
- unsigned VF = 1);
- int getMinMaxReductionCost(Type *Ty, Type *CondTy,
- bool IsPairwiseForm,
- bool IsUnsigned);
- unsigned getUserCost(const User *U, ArrayRef<const Value *> Operands);
+ int getArithmeticReductionCost(
+ unsigned Opcode,
+ VectorType *Ty,
+ bool IsPairwise,
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput);
+
+ int getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind);
+ int getMinMaxReductionCost(
+ VectorType *Ty, VectorType *CondTy, bool IsPairwiseForm, bool IsUnsigned,
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput);
};
class R600TTIImpl final : public BasicTTIImplBase<R600TTIImpl> {
@@ -245,28 +263,28 @@ public:
: BaseT(TM, F.getParent()->getDataLayout()),
ST(static_cast<const R600Subtarget*>(TM->getSubtargetImpl(F))),
TLI(ST->getTargetLowering()),
- CommonTTI(TM, F) {}
+ CommonTTI(TM, F) {}
const R600Subtarget *getST() const { return ST; }
const AMDGPUTargetLowering *getTLI() const { return TLI; }
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP);
+ void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::PeelingPreferences &PP);
unsigned getHardwareNumberOfRegisters(bool Vec) const;
unsigned getNumberOfRegisters(bool Vec) const;
unsigned getRegisterBitWidth(bool Vector) const;
unsigned getMinVectorRegisterBitWidth() const;
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
- bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, unsigned Alignment,
+ bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment,
unsigned AddrSpace) const;
- bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
- unsigned Alignment,
+ bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment,
unsigned AddrSpace) const;
- bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
- unsigned Alignment,
+ bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
unsigned AddrSpace) const;
unsigned getMaxInterleaveFactor(unsigned VF);
- unsigned getCFInstrCost(unsigned Opcode);
+ unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind);
int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
};
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
index 191f603a66d6a..418296684d765 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -34,6 +34,7 @@
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Type.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
@@ -117,24 +118,58 @@ static bool isUniformlyReached(const LegacyDivergenceAnalysis &DA,
return true;
}
+static void removeDoneExport(Function &F) {
+ ConstantInt *BoolFalse = ConstantInt::getFalse(F.getContext());
+ for (BasicBlock &BB : F) {
+ for (Instruction &I : BB) {
+ if (IntrinsicInst *Intrin = llvm::dyn_cast<IntrinsicInst>(&I)) {
+ if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp) {
+ Intrin->setArgOperand(6, BoolFalse); // done
+ } else if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp_compr) {
+ Intrin->setArgOperand(4, BoolFalse); // done
+ }
+ }
+ }
+ }
+}
+
static BasicBlock *unifyReturnBlockSet(Function &F,
ArrayRef<BasicBlock *> ReturningBlocks,
+ bool InsertExport,
const TargetTransformInfo &TTI,
StringRef Name) {
// Otherwise, we need to insert a new basic block into the function, add PHI
// nodes (if the function returns values), and convert all of the return
// instructions into unconditional branches.
BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), Name, &F);
+ IRBuilder<> B(NewRetBlock);
+
+ if (InsertExport) {
+ // Ensure that there's only one "done" export in the shader by removing the
+ // "done" bit set on the original final export. More than one "done" export
+ // can lead to undefined behavior.
+ removeDoneExport(F);
+
+ Value *Undef = UndefValue::get(B.getFloatTy());
+ B.CreateIntrinsic(Intrinsic::amdgcn_exp, { B.getFloatTy() },
+ {
+ B.getInt32(9), // target, SQ_EXP_NULL
+ B.getInt32(0), // enabled channels
+ Undef, Undef, Undef, Undef, // values
+ B.getTrue(), // done
+ B.getTrue(), // valid mask
+ });
+ }
PHINode *PN = nullptr;
if (F.getReturnType()->isVoidTy()) {
- ReturnInst::Create(F.getContext(), nullptr, NewRetBlock);
+ B.CreateRetVoid();
} else {
// If the function doesn't return void... add a PHI node to the block...
- PN = PHINode::Create(F.getReturnType(), ReturningBlocks.size(),
- "UnifiedRetVal");
- NewRetBlock->getInstList().push_back(PN);
- ReturnInst::Create(F.getContext(), PN, NewRetBlock);
+ PN = B.CreatePHI(F.getReturnType(), ReturningBlocks.size(),
+ "UnifiedRetVal");
+ assert(!InsertExport);
+ B.CreateRet(PN);
}
// Loop over all of the blocks, replacing the return instruction with an
@@ -160,7 +195,11 @@ static BasicBlock *unifyReturnBlockSet(Function &F,
bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
- if (PDT.getRoots().size() <= 1)
+
+ // If there's only one exit, we don't need to do anything, unless this is a
+ // pixel shader and that exit is an infinite loop, since we still have to
+ // insert an export in that case.
+ if (PDT.root_size() <= 1 && F.getCallingConv() != CallingConv::AMDGPU_PS)
return false;
LegacyDivergenceAnalysis &DA = getAnalysis<LegacyDivergenceAnalysis>();
@@ -168,15 +207,21 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
// Loop over all of the blocks in a function, tracking all of the blocks that
// return.
SmallVector<BasicBlock *, 4> ReturningBlocks;
+ SmallVector<BasicBlock *, 4> UniformlyReachedRetBlocks;
SmallVector<BasicBlock *, 4> UnreachableBlocks;
// Dummy return block for infinite loop.
BasicBlock *DummyReturnBB = nullptr;
- for (BasicBlock *BB : PDT.getRoots()) {
+ bool InsertExport = false;
+
+ bool Changed = false;
+ for (BasicBlock *BB : PDT.roots()) {
if (isa<ReturnInst>(BB->getTerminator())) {
if (!isUniformlyReached(DA, *BB))
ReturningBlocks.push_back(BB);
+ else
+ UniformlyReachedRetBlocks.push_back(BB);
} else if (isa<UnreachableInst>(BB->getTerminator())) {
if (!isUniformlyReached(DA, *BB))
UnreachableBlocks.push_back(BB);
@@ -188,6 +233,36 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
"DummyReturnBlock", &F);
Type *RetTy = F.getReturnType();
Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);
+
+ // For pixel shaders, the producer guarantees that an export is
+ // executed before each return instruction. However, if there is an
+ // infinite loop and we insert a return ourselves, we need to uphold
+ // that guarantee by inserting a null export. This can happen e.g. in
+ // an infinite loop with kill instructions, which is supposed to
+ // terminate. However, we don't need to do this if there is a non-void
+ // return value, since then there is an epilog afterwards which will
+ // still export.
+ //
+ // Note: In the case where only some threads enter the infinite loop,
+ // this can result in the null export happening redundantly after the
+ // original exports. However, the last "real" export happens after all
+ // the threads that didn't enter an infinite loop converged, which
+ // means that the only extra threads to execute the null export are
+ // threads that entered the infinite loop, and they could only have
+ // exited through being killed, which sets their exec bit to 0.
+ // Therefore, unless there's an actual infinite loop, which can have
+ // invalid results, or there's a kill after the last export, which we
+ // assume the frontend won't do, this export will have the same exec
+ // mask as the last "real" export, and therefore the valid mask will be
+ // overwritten with the same value and will still be correct. Also,
+ // even though this forces an extra unnecessary export wait, we assume
+ // that this happens rarely enough in practice that we don't have to
+ // worry about performance.
+ if (F.getCallingConv() == CallingConv::AMDGPU_PS &&
+ RetTy->isVoidTy()) {
+ InsertExport = true;
+ }
+
ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
ReturningBlocks.push_back(DummyReturnBB);
}
@@ -206,6 +281,7 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
BB->getTerminator()->eraseFromParent();
BranchInst::Create(TransitionBB, DummyReturnBB, BoolTrue, BB);
}
+ Changed = true;
}
}
@@ -224,6 +300,7 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
BB->getTerminator()->eraseFromParent();
BranchInst::Create(UnreachableBlock, BB);
}
+ Changed = true;
}
if (!ReturningBlocks.empty()) {
@@ -247,19 +324,32 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
// actually reached here.
ReturnInst::Create(F.getContext(), RetVal, UnreachableBlock);
ReturningBlocks.push_back(UnreachableBlock);
+ Changed = true;
}
}
// Now handle return blocks.
if (ReturningBlocks.empty())
- return false; // No blocks return
+ return Changed; // No blocks return
- if (ReturningBlocks.size() == 1)
- return false; // Already has a single return block
+ if (ReturningBlocks.size() == 1 && !InsertExport)
+ return Changed; // Already has a single return block
const TargetTransformInfo &TTI
= getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- unifyReturnBlockSet(F, ReturningBlocks, TTI, "UnifiedReturnBlock");
+ // Unify returning blocks. If we are going to insert the export, it is also
+ // necessary to include blocks that are uniformly reached, because in addition
+ // to inserting the export, the "done" bits on existing exports will be cleared
+ // and we do not want to end up with the normal export in a non-unified,
+ // uniformly reached block with the "done" bit cleared.
+ auto BlocksToUnify = std::move(ReturningBlocks);
+ if (InsertExport) {
+ BlocksToUnify.insert(BlocksToUnify.end(), UniformlyReachedRetBlocks.begin(),
+ UniformlyReachedRetBlocks.end());
+ }
+
+ unifyReturnBlockSet(F, BlocksToUnify, InsertExport, TTI,
+ "UnifiedReturnBlock");
return true;
}
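
To make the pixel-shader handling above concrete: when the pass has to synthesize a return for an infinite loop in an AMDGPU_PS function, it clears the "done" bit on the existing exports and emits a single null export (target SQ_EXP_NULL, no channels enabled, done and valid-mask set) in the unified return block. The sketch below reproduces just that insertion in a throwaway module so the resulting IR can be printed; it assumes an LLVM build to compile and link against, and names such as ps_main and insertNullExport are illustrative only.

// Minimal sketch of the null-export insertion done by unifyReturnBlockSet
// above; not part of the pass, just a standalone reproduction.
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

// Export to SQ_EXP_NULL (target 9) with no channels enabled, done=1, vm=1,
// matching the call the pass builds in the unified return block.
static void insertNullExport(IRBuilder<> &B) {
  Value *Undef = UndefValue::get(B.getFloatTy());
  B.CreateIntrinsic(Intrinsic::amdgcn_exp, {B.getFloatTy()},
                    {B.getInt32(9),              // target: SQ_EXP_NULL
                     B.getInt32(0),              // enabled channels: none
                     Undef, Undef, Undef, Undef, // values (ignored)
                     B.getTrue(),                // done
                     B.getTrue()});              // valid mask
}

int main() {
  LLVMContext Ctx;
  Module M("null-export-demo", Ctx);
  FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx), false);
  Function *F =
      Function::Create(FTy, GlobalValue::ExternalLinkage, "ps_main", &M);
  F->setCallingConv(CallingConv::AMDGPU_PS);

  BasicBlock *Ret = BasicBlock::Create(Ctx, "UnifiedReturnBlock", F);
  IRBuilder<> B(Ret);
  insertNullExport(B);
  B.CreateRetVoid();

  M.print(outs(), nullptr); // dump the IR with the synthesized export
  return 0;
}
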
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index f3aa1a5823689..013b7a0cf25d1 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -163,6 +163,7 @@ public:
ImmTyUNorm,
ImmTyDA,
ImmTyR128A16,
+ ImmTyA16,
ImmTyLWE,
ImmTyExpTgt,
ImmTyExpCompr,
@@ -277,6 +278,7 @@ public:
isRegClass(AMDGPU::VReg_96RegClassID) ||
isRegClass(AMDGPU::VReg_128RegClassID) ||
isRegClass(AMDGPU::VReg_160RegClassID) ||
+ isRegClass(AMDGPU::VReg_192RegClassID) ||
isRegClass(AMDGPU::VReg_256RegClassID) ||
isRegClass(AMDGPU::VReg_512RegClassID) ||
isRegClass(AMDGPU::VReg_1024RegClassID);
@@ -315,6 +317,7 @@ public:
bool isUNorm() const { return isImmTy(ImmTyUNorm); }
bool isDA() const { return isImmTy(ImmTyDA); }
bool isR128A16() const { return isImmTy(ImmTyR128A16); }
+ bool isGFX10A16() const { return isImmTy(ImmTyA16); }
bool isLWE() const { return isImmTy(ImmTyLWE); }
bool isOff() const { return isImmTy(ImmTyOff); }
bool isExpTgt() const { return isImmTy(ImmTyExpTgt); }
@@ -486,7 +489,7 @@ public:
}
bool isVSrcB16() const {
- return isVCSrcF16() || isLiteralImm(MVT::i16);
+ return isVCSrcB16() || isLiteralImm(MVT::i16);
}
bool isVSrcV2B16() const {
@@ -654,7 +657,7 @@ public:
bool isSendMsg() const;
bool isSwizzle() const;
bool isSMRDOffset8() const;
- bool isSMRDOffset20() const;
+ bool isSMEMOffset() const;
bool isSMRDLiteralOffset() const;
bool isDPP8() const;
bool isDPPCtrl() const;
@@ -847,6 +850,7 @@ public:
case ImmTyUNorm: OS << "UNorm"; break;
case ImmTyDA: OS << "DA"; break;
case ImmTyR128A16: OS << "R128A16"; break;
+ case ImmTyA16: OS << "A16"; break;
case ImmTyLWE: OS << "LWE"; break;
case ImmTyOff: OS << "Off"; break;
case ImmTyExpTgt: OS << "ExpTgt"; break;
@@ -1062,17 +1066,20 @@ private:
bool AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth,
RegisterKind RegKind, unsigned Reg1);
- bool ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg,
- unsigned& RegNum, unsigned& RegWidth);
- unsigned ParseRegularReg(RegisterKind &RegKind,
- unsigned &RegNum,
- unsigned &RegWidth);
- unsigned ParseSpecialReg(RegisterKind &RegKind,
- unsigned &RegNum,
- unsigned &RegWidth);
- unsigned ParseRegList(RegisterKind &RegKind,
- unsigned &RegNum,
- unsigned &RegWidth);
+ bool ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg,
+ unsigned &RegNum, unsigned &RegWidth,
+ bool RestoreOnFailure = false);
+ bool ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg,
+ unsigned &RegNum, unsigned &RegWidth,
+ SmallVectorImpl<AsmToken> &Tokens);
+ unsigned ParseRegularReg(RegisterKind &RegKind, unsigned &RegNum,
+ unsigned &RegWidth,
+ SmallVectorImpl<AsmToken> &Tokens);
+ unsigned ParseSpecialReg(RegisterKind &RegKind, unsigned &RegNum,
+ unsigned &RegWidth,
+ SmallVectorImpl<AsmToken> &Tokens);
+ unsigned ParseRegList(RegisterKind &RegKind, unsigned &RegNum,
+ unsigned &RegWidth, SmallVectorImpl<AsmToken> &Tokens);
bool ParseRegRange(unsigned& Num, unsigned& Width);
unsigned getRegularReg(RegisterKind RegKind,
unsigned RegNum,
@@ -1157,6 +1164,10 @@ public:
return AMDGPU::hasPackedD16(getSTI());
}
+ bool hasGFX10A16() const {
+ return AMDGPU::hasGFX10A16(getSTI());
+ }
+
bool isSI() const {
return AMDGPU::isSI(getSTI());
}
@@ -1177,6 +1188,10 @@ public:
return AMDGPU::isGFX10(getSTI());
}
+ bool isGFX10_BEncoding() const {
+ return AMDGPU::isGFX10_BEncoding(getSTI());
+ }
+
bool hasInv2PiInlineImm() const {
return getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm];
}
@@ -1226,8 +1241,12 @@ public:
bool isForcedSDWA() const { return ForcedSDWA; }
ArrayRef<unsigned> getMatchedVariants() const;
- std::unique_ptr<AMDGPUOperand> parseRegister();
+ std::unique_ptr<AMDGPUOperand> parseRegister(bool RestoreOnFailure = false);
+ bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc,
+ bool RestoreOnFailure);
bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+ OperandMatchResultTy tryParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+ SMLoc &EndLoc) override;
unsigned checkTargetMatchPredicate(MCInst &Inst) override;
unsigned validateTargetOperandClass(MCParsedAsmOperand &Op,
unsigned Kind) override;
@@ -1311,9 +1330,11 @@ private:
void errorExpTgt();
OperandMatchResultTy parseExpTgtImpl(StringRef Str, uint8_t &Val);
SMLoc getFlatOffsetLoc(const OperandVector &Operands) const;
+ SMLoc getSMEMOffsetLoc(const OperandVector &Operands) const;
bool validateInstruction(const MCInst &Inst, const SMLoc &IDLoc, const OperandVector &Operands);
bool validateFlatOffset(const MCInst &Inst, const OperandVector &Operands);
+ bool validateSMEMOffset(const MCInst &Inst, const OperandVector &Operands);
bool validateSOPLiteral(const MCInst &Inst) const;
bool validateConstantBusLimitations(const MCInst &Inst);
bool validateEarlyClobberLimitations(const MCInst &Inst);
@@ -1329,6 +1350,7 @@ private:
bool validateOpSel(const MCInst &Inst);
bool validateVccOperand(unsigned Reg) const;
bool validateVOP3Literal(const MCInst &Inst) const;
+ bool validateMAIAccWrite(const MCInst &Inst);
unsigned getConstantBusLimit(unsigned Opcode) const;
bool usesConstantBus(const MCInst &Inst, unsigned OpIdx);
bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const;
@@ -1390,7 +1412,7 @@ public:
AMDGPUOperand::Ptr defaultSLC() const;
AMDGPUOperand::Ptr defaultSMRDOffset8() const;
- AMDGPUOperand::Ptr defaultSMRDOffset20() const;
+ AMDGPUOperand::Ptr defaultSMEMOffset() const;
AMDGPUOperand::Ptr defaultSMRDLiteralOffset() const;
AMDGPUOperand::Ptr defaultFlatOffset() const;
@@ -1524,6 +1546,16 @@ static bool isSafeTruncation(int64_t Val, unsigned Size) {
return isUIntN(Size, Val) || isIntN(Size, Val);
}
+static bool isInlineableLiteralOp16(int64_t Val, MVT VT, bool HasInv2Pi) {
+ if (VT.getScalarType() == MVT::i16) {
+ // FP immediate values are broken.
+ return isInlinableIntLiteral(Val);
+ }
+
+ // f16/v2f16 operands work correctly for all values.
+ return AMDGPU::isInlinableLiteral16(Val, HasInv2Pi);
+}
+
bool AMDGPUOperand::isInlinableImm(MVT type) const {
// This is a hack to enable named inline values like
@@ -1555,9 +1587,9 @@ bool AMDGPUOperand::isInlinableImm(MVT type) const {
return false;
if (type.getScalarSizeInBits() == 16) {
- return AMDGPU::isInlinableLiteral16(
+ return isInlineableLiteralOp16(
static_cast<int16_t>(FPLiteral.bitcastToAPInt().getZExtValue()),
- AsmParser->hasInv2PiInlineImm());
+ type, AsmParser->hasInv2PiInlineImm());
}
// Check if single precision literal is inlinable
@@ -1577,9 +1609,9 @@ bool AMDGPUOperand::isInlinableImm(MVT type) const {
}
if (type.getScalarSizeInBits() == 16) {
- return AMDGPU::isInlinableLiteral16(
+ return isInlineableLiteralOp16(
static_cast<int16_t>(Literal.getLoBits(16).getSExtValue()),
- AsmParser->hasInv2PiInlineImm());
+ type, AsmParser->hasInv2PiInlineImm());
}
return AMDGPU::isInlinableLiteral32(
@@ -1901,6 +1933,7 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) {
case 3: return AMDGPU::VReg_96RegClassID;
case 4: return AMDGPU::VReg_128RegClassID;
case 5: return AMDGPU::VReg_160RegClassID;
+ case 6: return AMDGPU::VReg_192RegClassID;
case 8: return AMDGPU::VReg_256RegClassID;
case 16: return AMDGPU::VReg_512RegClassID;
case 32: return AMDGPU::VReg_1024RegClassID;
@@ -1919,7 +1952,10 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) {
default: return -1;
case 1: return AMDGPU::SGPR_32RegClassID;
case 2: return AMDGPU::SGPR_64RegClassID;
+ case 3: return AMDGPU::SGPR_96RegClassID;
case 4: return AMDGPU::SGPR_128RegClassID;
+ case 5: return AMDGPU::SGPR_160RegClassID;
+ case 6: return AMDGPU::SGPR_192RegClassID;
case 8: return AMDGPU::SGPR_256RegClassID;
case 16: return AMDGPU::SGPR_512RegClassID;
}
@@ -1928,7 +1964,11 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) {
default: return -1;
case 1: return AMDGPU::AGPR_32RegClassID;
case 2: return AMDGPU::AReg_64RegClassID;
+ case 3: return AMDGPU::AReg_96RegClassID;
case 4: return AMDGPU::AReg_128RegClassID;
+ case 5: return AMDGPU::AReg_160RegClassID;
+ case 6: return AMDGPU::AReg_192RegClassID;
+ case 8: return AMDGPU::AReg_256RegClassID;
case 16: return AMDGPU::AReg_512RegClassID;
case 32: return AMDGPU::AReg_1024RegClassID;
}
@@ -1975,12 +2015,13 @@ static unsigned getSpecialRegForName(StringRef RegName) {
.Case("tma_hi", AMDGPU::TMA_HI)
.Case("tba_lo", AMDGPU::TBA_LO)
.Case("tba_hi", AMDGPU::TBA_HI)
+ .Case("pc", AMDGPU::PC_REG)
.Case("null", AMDGPU::SGPR_NULL)
.Default(AMDGPU::NoRegister);
}
bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
- SMLoc &EndLoc) {
+ SMLoc &EndLoc, bool RestoreOnFailure) {
auto R = parseRegister();
if (!R) return true;
assert(R->isReg());
@@ -1990,6 +2031,25 @@ bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
return false;
}
+bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+ SMLoc &EndLoc) {
+ return ParseRegister(RegNo, StartLoc, EndLoc, /*RestoreOnFailure=*/false);
+}
+
+OperandMatchResultTy AMDGPUAsmParser::tryParseRegister(unsigned &RegNo,
+ SMLoc &StartLoc,
+ SMLoc &EndLoc) {
+ bool Result =
+ ParseRegister(RegNo, StartLoc, EndLoc, /*RestoreOnFailure=*/true);
+ bool PendingErrors = getParser().hasPendingError();
+ getParser().clearPendingErrors();
+ if (PendingErrors)
+ return MatchOperand_ParseFail;
+ if (Result)
+ return MatchOperand_NoMatch;
+ return MatchOperand_Success;
+}
+
bool AMDGPUAsmParser::AddNextRegisterToList(unsigned &Reg, unsigned &RegWidth,
RegisterKind RegKind, unsigned Reg1) {
switch (RegKind) {
@@ -2166,31 +2226,31 @@ AMDGPUAsmParser::ParseRegRange(unsigned& Num, unsigned& Width) {
return true;
}
-unsigned
-AMDGPUAsmParser::ParseSpecialReg(RegisterKind &RegKind,
- unsigned &RegNum,
- unsigned &RegWidth) {
+unsigned AMDGPUAsmParser::ParseSpecialReg(RegisterKind &RegKind,
+ unsigned &RegNum, unsigned &RegWidth,
+ SmallVectorImpl<AsmToken> &Tokens) {
assert(isToken(AsmToken::Identifier));
unsigned Reg = getSpecialRegForName(getTokenStr());
if (Reg) {
RegNum = 0;
RegWidth = 1;
RegKind = IS_SPECIAL;
+ Tokens.push_back(getToken());
lex(); // skip register name
}
return Reg;
}
-unsigned
-AMDGPUAsmParser::ParseRegularReg(RegisterKind &RegKind,
- unsigned &RegNum,
- unsigned &RegWidth) {
+unsigned AMDGPUAsmParser::ParseRegularReg(RegisterKind &RegKind,
+ unsigned &RegNum, unsigned &RegWidth,
+ SmallVectorImpl<AsmToken> &Tokens) {
assert(isToken(AsmToken::Identifier));
StringRef RegName = getTokenStr();
const RegInfo *RI = getRegularRegInfo(RegName);
if (!RI)
return AMDGPU::NoRegister;
+ Tokens.push_back(getToken());
lex(); // skip register name
RegKind = RI->Kind;
@@ -2209,10 +2269,9 @@ AMDGPUAsmParser::ParseRegularReg(RegisterKind &RegKind,
return getRegularReg(RegKind, RegNum, RegWidth);
}
-unsigned
-AMDGPUAsmParser::ParseRegList(RegisterKind &RegKind,
- unsigned &RegNum,
- unsigned &RegWidth) {
+unsigned AMDGPUAsmParser::ParseRegList(RegisterKind &RegKind, unsigned &RegNum,
+ unsigned &RegWidth,
+ SmallVectorImpl<AsmToken> &Tokens) {
unsigned Reg = AMDGPU::NoRegister;
if (!trySkipToken(AsmToken::LBrac))
@@ -2229,7 +2288,8 @@ AMDGPUAsmParser::ParseRegList(RegisterKind &RegKind,
RegisterKind NextRegKind;
unsigned NextReg, NextRegNum, NextRegWidth;
- if (!ParseAMDGPURegister(NextRegKind, NextReg, NextRegNum, NextRegWidth))
+ if (!ParseAMDGPURegister(NextRegKind, NextReg, NextRegNum, NextRegWidth,
+ Tokens))
return AMDGPU::NoRegister;
if (NextRegWidth != 1)
return AMDGPU::NoRegister;
@@ -2248,24 +2308,40 @@ AMDGPUAsmParser::ParseRegList(RegisterKind &RegKind,
return Reg;
}
-bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind,
- unsigned &Reg,
- unsigned &RegNum,
- unsigned &RegWidth) {
+bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg,
+ unsigned &RegNum, unsigned &RegWidth,
+ SmallVectorImpl<AsmToken> &Tokens) {
Reg = AMDGPU::NoRegister;
if (isToken(AsmToken::Identifier)) {
- Reg = ParseSpecialReg(RegKind, RegNum, RegWidth);
+ Reg = ParseSpecialReg(RegKind, RegNum, RegWidth, Tokens);
if (Reg == AMDGPU::NoRegister)
- Reg = ParseRegularReg(RegKind, RegNum, RegWidth);
+ Reg = ParseRegularReg(RegKind, RegNum, RegWidth, Tokens);
} else {
- Reg = ParseRegList(RegKind, RegNum, RegWidth);
+ Reg = ParseRegList(RegKind, RegNum, RegWidth, Tokens);
}
const MCRegisterInfo *TRI = getContext().getRegisterInfo();
return Reg != AMDGPU::NoRegister && subtargetHasRegister(*TRI, Reg);
}
+bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg,
+ unsigned &RegNum, unsigned &RegWidth,
+ bool RestoreOnFailure) {
+ Reg = AMDGPU::NoRegister;
+
+ SmallVector<AsmToken, 1> Tokens;
+ if (ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth, Tokens)) {
+ if (RestoreOnFailure) {
+ while (!Tokens.empty()) {
+ getLexer().UnLex(Tokens.pop_back_val());
+ }
+ }
+ return true;
+ }
+ return false;
+}
+
Optional<StringRef>
AMDGPUAsmParser::getGprCountSymbolName(RegisterKind RegKind) {
switch (RegKind) {
@@ -2314,7 +2390,8 @@ bool AMDGPUAsmParser::updateGprCountSymbols(RegisterKind RegKind,
return true;
}
-std::unique_ptr<AMDGPUOperand> AMDGPUAsmParser::parseRegister() {
+std::unique_ptr<AMDGPUOperand>
+AMDGPUAsmParser::parseRegister(bool RestoreOnFailure) {
const auto &Tok = Parser.getTok();
SMLoc StartLoc = Tok.getLoc();
SMLoc EndLoc = Tok.getEndLoc();
@@ -2758,16 +2835,22 @@ bool AMDGPUAsmParser::isInlineConstant(const MCInst &Inst,
return AMDGPU::isInlinableLiteral32(Val, hasInv2PiInlineImm());
case 2: {
const unsigned OperandType = Desc.OpInfo[OpIdx].OperandType;
+ if (OperandType == AMDGPU::OPERAND_REG_IMM_INT16 ||
+ OperandType == AMDGPU::OPERAND_REG_INLINE_C_INT16 ||
+ OperandType == AMDGPU::OPERAND_REG_INLINE_AC_INT16)
+ return AMDGPU::isInlinableIntLiteral(Val);
+
if (OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2INT16 ||
- OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2FP16 ||
OperandType == AMDGPU::OPERAND_REG_INLINE_AC_V2INT16 ||
+ OperandType == AMDGPU::OPERAND_REG_IMM_V2INT16)
+ return AMDGPU::isInlinableIntLiteralV216(Val);
+
+ if (OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2FP16 ||
OperandType == AMDGPU::OPERAND_REG_INLINE_AC_V2FP16 ||
- OperandType == AMDGPU::OPERAND_REG_IMM_V2INT16 ||
- OperandType == AMDGPU::OPERAND_REG_IMM_V2FP16) {
+ OperandType == AMDGPU::OPERAND_REG_IMM_V2FP16)
return AMDGPU::isInlinableLiteralV216(Val, hasInv2PiInlineImm());
- } else {
- return AMDGPU::isInlinableLiteral16(Val, hasInv2PiInlineImm());
- }
+
+ return AMDGPU::isInlinableLiteral16(Val, hasInv2PiInlineImm());
}
default:
llvm_unreachable("invalid operand size");
@@ -3085,6 +3168,30 @@ bool AMDGPUAsmParser::validateMovrels(const MCInst &Inst) {
return !isSGPR(mc2PseudoReg(Reg), TRI);
}
+bool AMDGPUAsmParser::validateMAIAccWrite(const MCInst &Inst) {
+
+ const unsigned Opc = Inst.getOpcode();
+
+ if (Opc != AMDGPU::V_ACCVGPR_WRITE_B32_vi)
+ return true;
+
+ const int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+ assert(Src0Idx != -1);
+
+ const MCOperand &Src0 = Inst.getOperand(Src0Idx);
+ if (!Src0.isReg())
+ return true;
+
+ auto Reg = Src0.getReg();
+ const MCRegisterInfo *TRI = getContext().getRegisterInfo();
+ if (isSGPR(mc2PseudoReg(Reg), TRI)) {
+ Error(getLoc(), "source operand must be either a VGPR or an inline constant");
+ return false;
+ }
+
+ return true;
+}
+
bool AMDGPUAsmParser::validateMIMGD16(const MCInst &Inst) {
const unsigned Opc = Inst.getOpcode();
@@ -3335,6 +3442,46 @@ bool AMDGPUAsmParser::validateFlatOffset(const MCInst &Inst,
return true;
}
+SMLoc AMDGPUAsmParser::getSMEMOffsetLoc(const OperandVector &Operands) const {
+ for (unsigned i = 1, e = Operands.size(); i != e; ++i) {
+ AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
+ if (Op.isSMEMOffset())
+ return Op.getStartLoc();
+ }
+ return getLoc();
+}
+
+bool AMDGPUAsmParser::validateSMEMOffset(const MCInst &Inst,
+ const OperandVector &Operands) {
+ if (isCI() || isSI())
+ return true;
+
+ uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags;
+ if ((TSFlags & SIInstrFlags::SMRD) == 0)
+ return true;
+
+ auto Opcode = Inst.getOpcode();
+ auto OpNum = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::offset);
+ if (OpNum == -1)
+ return true;
+
+ const auto &Op = Inst.getOperand(OpNum);
+ if (!Op.isImm())
+ return true;
+
+ uint64_t Offset = Op.getImm();
+ bool IsBuffer = AMDGPU::getSMEMIsBuffer(Opcode);
+ if (AMDGPU::isLegalSMRDEncodedUnsignedOffset(getSTI(), Offset) ||
+ AMDGPU::isLegalSMRDEncodedSignedOffset(getSTI(), Offset, IsBuffer))
+ return true;
+
+ Error(getSMEMOffsetLoc(Operands),
+ (isVI() || IsBuffer) ? "expected a 20-bit unsigned offset" :
+ "expected a 21-bit signed offset");
+
+ return false;
+}
+
bool AMDGPUAsmParser::validateSOPLiteral(const MCInst &Inst) const {
unsigned Opcode = Inst.getOpcode();
const MCInstrDesc &Desc = MII.get(Opcode);
@@ -3512,6 +3659,12 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
if (!validateFlatOffset(Inst, Operands)) {
return false;
}
+ if (!validateSMEMOffset(Inst, Operands)) {
+ return false;
+ }
+ if (!validateMAIAccWrite(Inst)) {
+ return false;
+ }
return true;
}
@@ -3556,7 +3709,7 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
return true;
}
Inst.setLoc(IDLoc);
- Out.EmitInstruction(Inst, getSTI());
+ Out.emitInstruction(Inst, getSTI());
return false;
case Match_MissingFeature:
@@ -4307,19 +4460,19 @@ bool AMDGPUAsmParser::ParseDirectiveAMDGPULDS() {
if (Size > LocalMemorySize)
return Error(SizeLoc, "size is too large");
- int64_t Align = 4;
+ int64_t Alignment = 4;
if (getLexer().is(AsmToken::Comma)) {
Lex();
SMLoc AlignLoc = getLexer().getLoc();
- if (getParser().parseAbsoluteExpression(Align))
+ if (getParser().parseAbsoluteExpression(Alignment))
return true;
- if (Align < 0 || !isPowerOf2_64(Align))
+ if (Alignment < 0 || !isPowerOf2_64(Alignment))
return Error(AlignLoc, "alignment must be a power of two");
// Alignment larger than the size of LDS is possible in theory, as long
// as the linker manages to place the symbol at address 0, but we do want
// to make sure the alignment fits nicely into a 32-bit integer.
- if (Align >= 1u << 31)
+ if (Alignment >= 1u << 31)
return Error(AlignLoc, "alignment is too large");
}
@@ -4331,7 +4484,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDGPULDS() {
if (!Symbol->isUndefined())
return Error(NameLoc, "invalid symbol redefinition");
- getTargetStreamer().emitAMDGPULDS(Symbol, Size, Align);
+ getTargetStreamer().emitAMDGPULDS(Symbol, Size, Align(Alignment));
return false;
}
@@ -4650,9 +4803,9 @@ AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands,
case AsmToken::Identifier: {
StringRef Tok = Parser.getTok().getString();
if (Tok == Name) {
- if (Tok == "r128" && isGFX9())
+ if (Tok == "r128" && !hasMIMG_R128())
Error(S, "r128 modifier is not supported on this GPU");
- if (Tok == "a16" && !isGFX9() && !isGFX10())
+ if (Tok == "a16" && !isGFX9() && !hasGFX10A16())
Error(S, "a16 modifier is not supported on this GPU");
Bit = 1;
Parser.Lex();
@@ -4672,6 +4825,9 @@ AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands,
if (!isGFX10() && ImmTy == AMDGPUOperand::ImmTyDLC)
return MatchOperand_ParseFail;
+ if (isGFX9() && ImmTy == AMDGPUOperand::ImmTyA16)
+ ImmTy = AMDGPUOperand::ImmTyR128A16;
+
Operands.push_back(AMDGPUOperand::CreateImm(this, Bit, S, ImmTy));
return MatchOperand_Success;
}
@@ -5987,6 +6143,8 @@ void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands,
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyR128A16);
+ if (IsGFX10)
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyA16);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyLWE);
if (!IsGFX10)
@@ -6006,8 +6164,8 @@ bool AMDGPUOperand::isSMRDOffset8() const {
return isImm() && isUInt<8>(getImm());
}
-bool AMDGPUOperand::isSMRDOffset20() const {
- return isImm() && isUInt<20>(getImm());
+bool AMDGPUOperand::isSMEMOffset() const {
+ return isImm(); // Offset range is checked later by validator.
}
bool AMDGPUOperand::isSMRDLiteralOffset() const {
@@ -6020,7 +6178,7 @@ AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSMRDOffset8() const {
return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset);
}
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSMRDOffset20() const {
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSMEMOffset() const {
return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset);
}
@@ -6096,7 +6254,7 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
{"unorm", AMDGPUOperand::ImmTyUNorm, true, nullptr},
{"da", AMDGPUOperand::ImmTyDA, true, nullptr},
{"r128", AMDGPUOperand::ImmTyR128A16, true, nullptr},
- {"a16", AMDGPUOperand::ImmTyR128A16, true, nullptr},
+ {"a16", AMDGPUOperand::ImmTyA16, true, nullptr},
{"lwe", AMDGPUOperand::ImmTyLWE, true, nullptr},
{"d16", AMDGPUOperand::ImmTyD16, true, nullptr},
{"dmask", AMDGPUOperand::ImmTyDMask, false, nullptr},
@@ -6499,7 +6657,7 @@ OperandMatchResultTy AMDGPUAsmParser::parseDim(OperandVector &Operands) {
std::string Token;
if (getLexer().is(AsmToken::Integer)) {
SMLoc Loc = getLexer().getTok().getEndLoc();
- Token = getLexer().getTok().getString();
+ Token = std::string(getLexer().getTok().getString());
Parser.Lex();
if (getLexer().getTok().getLoc() != Loc)
return MatchOperand_ParseFail;
@@ -7032,6 +7190,8 @@ unsigned AMDGPUAsmParser::validateTargetOperandClass(MCParsedAsmOperand &Op,
return Operand.isInterpAttr() ? Match_Success : Match_InvalidOperand;
case MCK_AttrChan:
return Operand.isAttrChan() ? Match_Success : Match_InvalidOperand;
+ case MCK_ImmSMEMOffset:
+ return Operand.isSMEMOffset() ? Match_Success : Match_InvalidOperand;
case MCK_SReg_64:
case MCK_SReg_64_XEXEC:
// Null is defined as a 32-bit register but
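
For context on validateSMEMOffset above: the parser now accepts any immediate as an SMEM offset (isSMEMOffset) and defers the range check to validation, which expects a 20-bit unsigned offset on VI and for buffer instructions and otherwise allows a 21-bit signed offset, as the two diagnostics in the hunk state. The standalone sketch below only illustrates those ranges; the helper names are assumptions for the example, not the parser's real entry points.

// Illustrative sketch of the offset ranges behind validateSMEMOffset's
// diagnostics; hypothetical helpers, not the AsmParser API.
#include <cstdint>
#include <cstdio>

static bool fitsUInt20(int64_t V) {
  return V >= 0 && V < (int64_t{1} << 20);
}
static bool fitsInt21(int64_t V) {
  return V >= -(int64_t{1} << 20) && V < (int64_t{1} << 20);
}

// VI and buffer SMEM instructions only encode a 20-bit unsigned offset;
// newer targets can also encode a 21-bit signed offset.
static bool isLegalSMEMOffset(int64_t Offset, bool UnsignedOnly) {
  if (fitsUInt20(Offset))
    return true;
  return !UnsignedOnly && fitsInt21(Offset);
}

int main() {
  std::printf("%d\n", isLegalSMEMOffset(0xFFFFF, true));  // 1: max 20-bit
  std::printf("%d\n", isLegalSMEMOffset(-4, true));       // 0: needs signed
  std::printf("%d\n", isLegalSMEMOffset(-4, false));      // 1: 21-bit signed
}
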
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 691aff4ecbb8a..fa42ddc54b565 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1,4 +1,4 @@
-//===-- BUFInstructions.td - Buffer Instruction Defintions ----------------===//
+//===-- BUFInstructions.td - Buffer Instruction Definitions ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -374,7 +374,8 @@ class MUBUF_Invalidate <string opName, SDPatternOperator node = null_frag> :
let AsmMatchConverter = "";
let hasSideEffects = 1;
- let mayStore = 1;
+ let mayLoad = 0;
+ let mayStore = 0;
// Set everything to 0.
let offen = 0;
@@ -1003,6 +1004,11 @@ defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Pseudo_Atomics <
"buffer_atomic_dec_x2", VReg_64, i64, atomic_dec_global_64
>;
+let SubtargetPredicate = HasGFX10_BEncoding in
+defm BUFFER_ATOMIC_CSUB : MUBUF_Pseudo_Atomics_RTN <
+ "buffer_atomic_csub", VGPR_32, i32, atomic_csub_global_32
+>;
+
let SubtargetPredicate = isGFX8GFX9 in {
def BUFFER_STORE_LDS_DWORD : MUBUF_Pseudo_Store_Lds <"buffer_store_lds_dword">;
}
@@ -1152,22 +1158,6 @@ let SubtargetPredicate = isGFX10Plus in {
// MUBUF Patterns
//===----------------------------------------------------------------------===//
-def extract_glc : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(N->getZExtValue() & 1, SDLoc(N), MVT::i8);
-}]>;
-
-def extract_slc : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant((N->getZExtValue() >> 1) & 1, SDLoc(N), MVT::i8);
-}]>;
-
-def extract_dlc : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant((N->getZExtValue() >> 2) & 1, SDLoc(N), MVT::i8);
-}]>;
-
-def extract_swz : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant((N->getZExtValue() >> 3) & 1, SDLoc(N), MVT::i8);
-}]>;
-
//===----------------------------------------------------------------------===//
// buffer_load/store_format patterns
//===----------------------------------------------------------------------===//
@@ -1177,24 +1167,24 @@ multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
def : GCNPat<
(vt (name v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
timm:$auxiliary, 0)),
- (!cast<MUBUF_Pseudo>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset),
- (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+ (!cast<MUBUF_Pseudo>(opcode # _OFFSET) SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
+ (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
(extract_swz $auxiliary))
>;
def : GCNPat<
(vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
timm:$auxiliary, 0)),
- (!cast<MUBUF_Pseudo>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset),
- (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+ (!cast<MUBUF_Pseudo>(opcode # _OFFEN) VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
+ (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
(extract_swz $auxiliary))
>;
def : GCNPat<
(vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
timm:$auxiliary, timm)),
- (!cast<MUBUF_Pseudo>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset),
- (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+ (!cast<MUBUF_Pseudo>(opcode # _IDXEN) VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
+ (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
(extract_swz $auxiliary))
>;
@@ -1202,9 +1192,9 @@ multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
(vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset,
timm:$auxiliary, timm)),
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN)
- (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
- $rsrc, $soffset, (as_i16imm $offset),
- (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+ (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
+ SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
+ (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
(extract_swz $auxiliary))
>;
}
@@ -1221,6 +1211,7 @@ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v4i32, "BUFFER_LOAD_FORMAT_X
let SubtargetPredicate = HasUnpackedD16VMem in {
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, f16, "BUFFER_LOAD_FORMAT_D16_X_gfx80">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i16, "BUFFER_LOAD_FORMAT_D16_X_gfx80">;
+ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i32, "BUFFER_LOAD_FORMAT_D16_X_gfx80">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2i32, "BUFFER_LOAD_FORMAT_D16_XY_gfx80">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v4i32, "BUFFER_LOAD_FORMAT_D16_XYZW_gfx80">;
} // End HasUnpackedD16VMem.
@@ -1228,6 +1219,7 @@ let SubtargetPredicate = HasUnpackedD16VMem in {
let SubtargetPredicate = HasPackedD16VMem in {
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, f16, "BUFFER_LOAD_FORMAT_D16_X">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i16, "BUFFER_LOAD_FORMAT_D16_X">;
+ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i32, "BUFFER_LOAD_FORMAT_D16_X">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2f16, "BUFFER_LOAD_FORMAT_D16_XY">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2i16, "BUFFER_LOAD_FORMAT_D16_XY">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v4f16, "BUFFER_LOAD_FORMAT_D16_XYZW">;
@@ -1256,7 +1248,7 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
def : GCNPat<
(name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
timm:$auxiliary, 0),
- (!cast<MUBUF_Pseudo>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, (as_i16imm $offset),
+ (!cast<MUBUF_Pseudo>(opcode # _OFFSET_exact) getVregSrcForVT<vt>.ret:$vdata, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
(extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
(extract_swz $auxiliary))
>;
@@ -1264,8 +1256,8 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
def : GCNPat<
(name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
timm:$auxiliary, 0),
- (!cast<MUBUF_Pseudo>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset,
- (as_i16imm $offset), (extract_glc $auxiliary),
+ (!cast<MUBUF_Pseudo>(opcode # _OFFEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset,
+ (as_i16timm $offset), (extract_glc $auxiliary),
(extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
(extract_swz $auxiliary))
>;
@@ -1273,8 +1265,8 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
def : GCNPat<
(name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
timm:$auxiliary, timm),
- (!cast<MUBUF_Pseudo>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset,
- (as_i16imm $offset), (extract_glc $auxiliary),
+ (!cast<MUBUF_Pseudo>(opcode # _IDXEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset,
+ (as_i16timm $offset), (extract_glc $auxiliary),
(extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
(extract_swz $auxiliary))
>;
@@ -1283,9 +1275,9 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
(name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset,
timm:$auxiliary, timm),
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN_exact)
- $vdata,
- (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
- $rsrc, $soffset, (as_i16imm $offset), (extract_glc $auxiliary),
+ getVregSrcForVT<vt>.ret:$vdata,
+ (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
+ SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_glc $auxiliary),
(extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
(extract_swz $auxiliary))
>;
@@ -1303,6 +1295,7 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v4i32, "BUFFER_STORE_FORMA
let SubtargetPredicate = HasUnpackedD16VMem in {
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, f16, "BUFFER_STORE_FORMAT_D16_X_gfx80">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, i16, "BUFFER_STORE_FORMAT_D16_X_gfx80">;
+ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, i32, "BUFFER_STORE_FORMAT_D16_X_gfx80">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v2i32, "BUFFER_STORE_FORMAT_D16_XY_gfx80">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v4i32, "BUFFER_STORE_FORMAT_D16_XYZW_gfx80">;
} // End HasUnpackedD16VMem.
@@ -1310,6 +1303,7 @@ let SubtargetPredicate = HasUnpackedD16VMem in {
let SubtargetPredicate = HasPackedD16VMem in {
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, f16, "BUFFER_STORE_FORMAT_D16_X">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, i16, "BUFFER_STORE_FORMAT_D16_X">;
+ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, i32, "BUFFER_STORE_FORMAT_D16_X">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v2f16, "BUFFER_STORE_FORMAT_D16_XY">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v2i16, "BUFFER_STORE_FORMAT_D16_XY">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v4f16, "BUFFER_STORE_FORMAT_D16_XYZW">;
@@ -1338,37 +1332,37 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_short, i32, "BUFFER_STORE_SHORT">;
multiclass BufferAtomicPatterns<SDPatternOperator name, ValueType vt,
string opcode> {
def : GCNPat<
- (vt (name vt:$vdata_in, v4i32:$rsrc, 0,
- 0, i32:$soffset, timm:$offset,
- timm:$cachepolicy, 0)),
- (!cast<MUBUF_Pseudo>(opcode # _OFFSET_RTN) $vdata_in, $rsrc, $soffset,
- (as_i16imm $offset), (extract_slc $cachepolicy))
+ (vt (name vt:$vdata_in, v4i32:$rsrc, 0, 0, i32:$soffset,
+ timm:$offset, timm:$cachepolicy, 0)),
+ (!cast<MUBUF_Pseudo>(opcode # _OFFSET_RTN)
+ getVregSrcForVT<vt>.ret:$vdata_in, SReg_128:$rsrc, SCSrc_b32:$soffset,
+ (as_i16timm $offset), (extract_slc $cachepolicy))
>;
def : GCNPat<
- (vt (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
- 0, i32:$soffset, timm:$offset,
- timm:$cachepolicy, timm)),
- (!cast<MUBUF_Pseudo>(opcode # _IDXEN_RTN) $vdata_in, $vindex, $rsrc, $soffset,
- (as_i16imm $offset), (extract_slc $cachepolicy))
+ (vt (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset,
+ timm:$offset, timm:$cachepolicy, timm)),
+ (!cast<MUBUF_Pseudo>(opcode # _IDXEN_RTN) getVregSrcForVT<vt>.ret:$vdata_in,
+ VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset,
+ (as_i16timm $offset), (extract_slc $cachepolicy))
>;
def : GCNPat<
- (vt (name vt:$vdata_in, v4i32:$rsrc, 0,
- i32:$voffset, i32:$soffset, timm:$offset,
- timm:$cachepolicy, 0)),
- (!cast<MUBUF_Pseudo>(opcode # _OFFEN_RTN) $vdata_in, $voffset, $rsrc, $soffset,
- (as_i16imm $offset), (extract_slc $cachepolicy))
+ (vt (name vt:$vdata_in, v4i32:$rsrc, 0, i32:$voffset,
+ i32:$soffset, timm:$offset, timm:$cachepolicy, 0)),
+ (!cast<MUBUF_Pseudo>(opcode # _OFFEN_RTN) getVregSrcForVT<vt>.ret:$vdata_in,
+ VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset,
+ (as_i16timm $offset), (extract_slc $cachepolicy))
>;
def : GCNPat<
- (vt (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
- i32:$voffset, i32:$soffset, timm:$offset,
- timm:$cachepolicy, timm)),
+ (vt (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex, i32:$voffset,
+ i32:$soffset, timm:$offset, timm:$cachepolicy, timm)),
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN_RTN)
- $vdata_in,
- (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
- $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy))
+ getVregSrcForVT<vt>.ret:$vdata_in,
+ (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
+ SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
+ (extract_slc $cachepolicy))
>;
}
@@ -1384,6 +1378,7 @@ defm : BufferAtomicPatterns<SIbuffer_atomic_or, i32, "BUFFER_ATOMIC_OR">;
defm : BufferAtomicPatterns<SIbuffer_atomic_xor, i32, "BUFFER_ATOMIC_XOR">;
defm : BufferAtomicPatterns<SIbuffer_atomic_inc, i32, "BUFFER_ATOMIC_INC">;
defm : BufferAtomicPatterns<SIbuffer_atomic_dec, i32, "BUFFER_ATOMIC_DEC">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_csub, i32, "BUFFER_ATOMIC_CSUB">;
defm : BufferAtomicPatterns<SIbuffer_atomic_swap, i64, "BUFFER_ATOMIC_SWAP_X2">;
defm : BufferAtomicPatterns<SIbuffer_atomic_add, i64, "BUFFER_ATOMIC_ADD_X2">;
defm : BufferAtomicPatterns<SIbuffer_atomic_sub, i64, "BUFFER_ATOMIC_SUB_X2">;
@@ -1434,19 +1429,20 @@ multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt,
>;
}
+let SubtargetPredicate = HasAtomicFaddInsts in {
defm : BufferAtomicPatterns_NO_RTN<SIbuffer_atomic_fadd, f32, "BUFFER_ATOMIC_ADD_F32">;
defm : BufferAtomicPatterns_NO_RTN<SIbuffer_atomic_pk_fadd, v2f16, "BUFFER_ATOMIC_PK_ADD_F16">;
+}
def : GCNPat<
(SIbuffer_atomic_cmpswap
- i32:$data, i32:$cmp, v4i32:$rsrc, 0,
- 0, i32:$soffset, timm:$offset,
- timm:$cachepolicy, 0),
+ i32:$data, i32:$cmp, v4i32:$rsrc, 0, 0, i32:$soffset,
+ timm:$offset, timm:$cachepolicy, 0),
(EXTRACT_SUBREG
(BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN
- (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
- $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)),
- sub0)
+ (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1),
+ SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
+ (extract_slc $cachepolicy)), sub0)
>;
def : GCNPat<
@@ -1456,8 +1452,8 @@ def : GCNPat<
timm:$cachepolicy, timm),
(EXTRACT_SUBREG
(BUFFER_ATOMIC_CMPSWAP_IDXEN_RTN
- (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
- $vindex, $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)),
+ (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1),
+ VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_slc $cachepolicy)),
sub0)
>;
@@ -1468,8 +1464,8 @@ def : GCNPat<
timm:$cachepolicy, 0),
(EXTRACT_SUBREG
(BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN
- (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
- $voffset, $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)),
+ (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1),
+ VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_slc $cachepolicy)),
sub0)
>;
@@ -1480,9 +1476,9 @@ def : GCNPat<
timm:$cachepolicy, timm),
(EXTRACT_SUBREG
(BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN
- (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
- (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
- $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)),
+ (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1),
+ (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
+ SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_slc $cachepolicy)),
sub0)
>;
@@ -1584,7 +1580,7 @@ defm : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET,
defm : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET, i16, load_private>;
foreach vt = Reg32Types.types in {
-defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, BUFFER_LOAD_DWORD_OFFSET, i32, load_private>;
+defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, BUFFER_LOAD_DWORD_OFFSET, vt, load_private>;
}
defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, BUFFER_LOAD_DWORDX2_OFFSET, v2i32, load_private>;
defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX3_OFFEN, BUFFER_LOAD_DWORDX3_OFFSET, v3i32, load_private>;
@@ -1692,8 +1688,8 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
def : GCNPat<
(vt (name v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
timm:$format, timm:$auxiliary, 0)),
- (!cast<MTBUF_Pseudo>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset),
- (as_i8imm $format),
+ (!cast<MTBUF_Pseudo>(opcode # _OFFSET) SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
+ (as_i8timm $format),
(extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
(extract_swz $auxiliary))
>;
@@ -1701,8 +1697,8 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
def : GCNPat<
(vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
timm:$format, timm:$auxiliary, timm)),
- (!cast<MTBUF_Pseudo>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset),
- (as_i8imm $format),
+ (!cast<MTBUF_Pseudo>(opcode # _IDXEN) VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
+ (as_i8timm $format),
(extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
(extract_swz $auxiliary))
>;
@@ -1710,8 +1706,8 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
def : GCNPat<
(vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
timm:$format, timm:$auxiliary, 0)),
- (!cast<MTBUF_Pseudo>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset),
- (as_i8imm $format),
+ (!cast<MTBUF_Pseudo>(opcode # _OFFEN) VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
+ (as_i8timm $format),
(extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
(extract_swz $auxiliary))
>;
@@ -1720,9 +1716,9 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
(vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset,
timm:$format, timm:$auxiliary, timm)),
(!cast<MTBUF_Pseudo>(opcode # _BOTHEN)
- (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
- $rsrc, $soffset, (as_i16imm $offset),
- (as_i8imm $format),
+ (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
+ SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
+ (as_i8timm $format),
(extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
(extract_swz $auxiliary))
>;
@@ -1739,12 +1735,14 @@ defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v4f32, "TBUFFER_LOAD_FORMAT_XYZW">
let SubtargetPredicate = HasUnpackedD16VMem in {
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, f16, "TBUFFER_LOAD_FORMAT_D16_X_gfx80">;
+ defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, i32, "TBUFFER_LOAD_FORMAT_D16_X_gfx80">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v2i32, "TBUFFER_LOAD_FORMAT_D16_XY_gfx80">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v4i32, "TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80">;
} // End HasUnpackedD16VMem.
let SubtargetPredicate = HasPackedD16VMem in {
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, f16, "TBUFFER_LOAD_FORMAT_D16_X">;
+ defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, i32, "TBUFFER_LOAD_FORMAT_D16_X">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v2f16, "TBUFFER_LOAD_FORMAT_D16_XY">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v4f16, "TBUFFER_LOAD_FORMAT_D16_XYZW">;
} // End HasPackedD16VMem.
@@ -1754,8 +1752,8 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
def : GCNPat<
(name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
timm:$format, timm:$auxiliary, 0),
- (!cast<MTBUF_Pseudo>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset,
- (as_i16imm $offset), (as_i8imm $format),
+ (!cast<MTBUF_Pseudo>(opcode # _OFFSET_exact) getVregSrcForVT<vt>.ret:$vdata, SReg_128:$rsrc, SCSrc_b32:$soffset,
+ (as_i16timm $offset), (as_i8timm $format),
(extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
(extract_swz $auxiliary))
>;
@@ -1763,8 +1761,8 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
def : GCNPat<
(name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
timm:$format, timm:$auxiliary, timm),
- (!cast<MTBUF_Pseudo>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset,
- (as_i16imm $offset), (as_i8imm $format),
+ (!cast<MTBUF_Pseudo>(opcode # _IDXEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset,
+ (as_i16timm $offset), (as_i8timm $format),
(extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
(extract_swz $auxiliary))
>;
@@ -1772,8 +1770,8 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
def : GCNPat<
(name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
timm:$format, timm:$auxiliary, 0),
- (!cast<MTBUF_Pseudo>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset,
- (as_i16imm $offset), (as_i8imm $format),
+ (!cast<MTBUF_Pseudo>(opcode # _OFFEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset,
+ (as_i16timm $offset), (as_i8timm $format),
(extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
(extract_swz $auxiliary))
>;
@@ -1782,9 +1780,9 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
(name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset,
timm:$offset, timm:$format, timm:$auxiliary, timm),
(!cast<MTBUF_Pseudo>(opcode # _BOTHEN_exact)
- $vdata,
- (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
- $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format),
+ getVregSrcForVT<vt>.ret:$vdata,
+ (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
+ SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (as_i8timm $format),
(extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
(extract_swz $auxiliary))
>;
@@ -1801,12 +1799,14 @@ defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v4f32, "TBUFFER_STORE_FORMAT_XYZ
let SubtargetPredicate = HasUnpackedD16VMem in {
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, f16, "TBUFFER_STORE_FORMAT_D16_X_gfx80">;
+ defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, i32, "TBUFFER_STORE_FORMAT_D16_X_gfx80">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v2i32, "TBUFFER_STORE_FORMAT_D16_XY_gfx80">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v4i32, "TBUFFER_STORE_FORMAT_D16_XYZW_gfx80">;
} // End HasUnpackedD16VMem.
let SubtargetPredicate = HasPackedD16VMem in {
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, f16, "TBUFFER_STORE_FORMAT_D16_X">;
+ defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, i32, "TBUFFER_STORE_FORMAT_D16_X">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v2f16, "TBUFFER_STORE_FORMAT_D16_XY">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v4f16, "TBUFFER_STORE_FORMAT_D16_XYZW">;
} // End HasPackedD16VMem.
@@ -1888,8 +1888,7 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
def _LDS_BOTHEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>,
MUBUFLdsTable<1, NAME # "_BOTHEN_gfx10">;
}
- multiclass MUBUF_Real_Atomics_gfx10<bits<8> op> :
- MUBUF_Real_AllAddr_gfx10<op> {
+ multiclass MUBUF_Real_Atomics_RTN_gfx10<bits<8> op> {
def _BOTHEN_RTN_gfx10 :
MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN")>;
def _IDXEN_RTN_gfx10 :
@@ -1899,6 +1898,8 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
def _OFFSET_RTN_gfx10 :
MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN")>;
}
+ multiclass MUBUF_Real_Atomics_gfx10<bits<8> op> :
+ MUBUF_Real_AllAddr_gfx10<op>, MUBUF_Real_Atomics_RTN_gfx10<op>;
} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10"
defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Real_AllAddr_gfx10<0x019>;
@@ -2063,6 +2064,8 @@ defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05e>;
defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05f>;
defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x060>;
+defm BUFFER_ATOMIC_CSUB : MUBUF_Real_Atomics_RTN_gfx10<0x034>;
+
defm BUFFER_WBINVL1_SC : MUBUF_Real_gfx6<0x070>;
defm BUFFER_WBINVL1_VOL : MUBUF_Real_gfx7<0x070>;
def BUFFER_WBINVL1_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<0x071, BUFFER_WBINVL1>;
diff --git a/llvm/lib/Target/AMDGPU/CaymanInstructions.td b/llvm/lib/Target/AMDGPU/CaymanInstructions.td
index 1a526675164a0..f4ddbf1131c34 100644
--- a/llvm/lib/Target/AMDGPU/CaymanInstructions.td
+++ b/llvm/lib/Target/AMDGPU/CaymanInstructions.td
@@ -50,16 +50,19 @@ def COS_cm : COS_Common<0x8E>;
def : RsqPat<RECIPSQRT_IEEE_cm, f32>;
+def : SqrtPat<RECIPSQRT_IEEE_cm, RECIP_IEEE_cm>;
+
def : POW_Common <LOG_IEEE_cm, EXP_IEEE_cm, MUL>;
defm DIV_cm : DIV_Common<RECIP_IEEE_cm>;
// RECIP_UINT emulation for Cayman
-// The multiplication scales from [0,1] to the unsigned integer range
+// The multiplication scales from [0,1) to the unsigned integer range,
+// rounding down a bit to avoid unwanted overflow.
def : R600Pat <
(AMDGPUurecip i32:$src0),
(FLT_TO_UINT_eg (MUL_IEEE (RECIP_IEEE_cm (UINT_TO_FLT_eg $src0)),
- (MOV_IMM_I32 CONST.FP_UINT_MAX_PLUS_1)))
+ (MOV_IMM_I32 CONST.FP_4294966784)))
>;
def CF_END_CM : CF_CLAUSE_EG<32, (ins), "CF_END"> {
@@ -70,8 +73,6 @@ def CF_END_CM : CF_CLAUSE_EG<32, (ins), "CF_END"> {
-def : R600Pat<(fsqrt f32:$src), (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm $src))>;
-
class RAT_STORE_DWORD <RegisterClass rc, ValueType vt, bits<4> mask> :
CF_MEM_RAT_CACHELESS <0x14, 0, mask,
(ins rc:$rw_gpr, R600_TReg32_X:$index_gpr),
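For reference, here is a minimal host-side sketch of what the updated AMDGPUurecip pattern selects; the helper name emulateURecip is invented for illustration, and the scale constant simply mirrors CONST.FP_4294966784 from the pattern above.

#include <cstdint>

// Hypothetical emulation of the selected RECIP_IEEE/MUL_IEEE/FLT_TO_UINT
// sequence: approximate 2^32 / x via a float reciprocal scaled back up.
// Using 4294966784.0f (slightly below 2^32) rather than 2^32 itself keeps
// the float-to-uint conversion in range when x == 1.
static uint32_t emulateURecip(uint32_t x) {
  float r = 1.0f / static_cast<float>(x);  // RECIP_IEEE_cm
  float scaled = r * 4294966784.0f;        // MUL_IEEE with the MOV_IMM_I32 constant
  return static_cast<uint32_t>(scaled);    // FLT_TO_UINT_eg
}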
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index fe7faca8b1570..beb01b1abf0f8 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -1,4 +1,4 @@
-//===-- DSInstructions.td - DS Instruction Defintions ---------------------===//
+//===-- DSInstructions.td - DS Instruction Definitions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -388,7 +388,12 @@ defm DS_MAX_U32 : DS_1A1D_NORET_mc<"ds_max_u32">;
defm DS_AND_B32 : DS_1A1D_NORET_mc<"ds_and_b32">;
defm DS_OR_B32 : DS_1A1D_NORET_mc<"ds_or_b32">;
defm DS_XOR_B32 : DS_1A1D_NORET_mc<"ds_xor_b32">;
+
+let SubtargetPredicate = HasLDSFPAtomics in {
defm DS_ADD_F32 : DS_1A1D_NORET_mc<"ds_add_f32">;
+}
+
+// FIXME: Are these really present pre-gfx8?
defm DS_MIN_F32 : DS_1A1D_NORET_mc<"ds_min_f32">;
defm DS_MAX_F32 : DS_1A1D_NORET_mc<"ds_max_f32">;
@@ -443,7 +448,10 @@ defm DS_MIN_F64 : DS_1A1D_NORET_mc<"ds_min_f64", VReg_64>;
defm DS_MAX_F64 : DS_1A1D_NORET_mc<"ds_max_f64", VReg_64>;
defm DS_ADD_RTN_U32 : DS_1A1D_RET_mc<"ds_add_rtn_u32", VGPR_32, "ds_add_u32">;
+
+let SubtargetPredicate = HasLDSFPAtomics in {
defm DS_ADD_RTN_F32 : DS_1A1D_RET_mc<"ds_add_rtn_f32", VGPR_32, "ds_add_f32">;
+}
defm DS_SUB_RTN_U32 : DS_1A1D_RET_mc<"ds_sub_rtn_u32", VGPR_32, "ds_sub_u32">;
defm DS_RSUB_RTN_U32 : DS_1A1D_RET_mc<"ds_rsub_rtn_u32", VGPR_32, "ds_rsub_u32">;
defm DS_INC_RTN_U32 : DS_1A1D_RET_mc<"ds_inc_rtn_u32", VGPR_32, "ds_inc_u32">;
@@ -497,6 +505,7 @@ def DS_GWS_SEMA_P : DS_GWS_0D<"ds_gws_sema_p">;
def DS_GWS_BARRIER : DS_GWS_1D<"ds_gws_barrier">;
}
+let SubtargetPredicate = HasDsSrc2Insts in {
def DS_ADD_SRC2_U32 : DS_1A<"ds_add_src2_u32">;
def DS_SUB_SRC2_U32 : DS_1A<"ds_sub_src2_u32">;
def DS_RSUB_SRC2_U32 : DS_1A<"ds_rsub_src2_u32">;
@@ -529,6 +538,7 @@ def DS_MAX_SRC2_F64 : DS_1A<"ds_max_src2_f64">;
def DS_WRITE_SRC2_B32 : DS_1A<"ds_write_src2_b32">;
def DS_WRITE_SRC2_B64 : DS_1A<"ds_write_src2_b64">;
+} // End SubtargetPredicate = HasDsSrc2Insts
let Uses = [EXEC], mayLoad = 0, mayStore = 0, isConvergent = 1 in {
def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32", VGPR_32, 0, SwizzleImm>;
@@ -609,10 +619,12 @@ def DS_BPERMUTE_B32 : DS_1A1D_PERMUTE <"ds_bpermute_b32",
int_amdgcn_ds_bpermute>;
}
-def DS_ADD_SRC2_F32 : DS_1A<"ds_add_src2_f32">;
-
} // let SubtargetPredicate = isGFX8Plus
+let SubtargetPredicate = HasLDSFPAtomics, OtherPredicates = [HasDsSrc2Insts] in {
+def DS_ADD_SRC2_F32 : DS_1A<"ds_add_src2_f32">;
+}
+
//===----------------------------------------------------------------------===//
// DS Patterns
//===----------------------------------------------------------------------===//
@@ -725,7 +737,7 @@ defm : DSWritePat_mc <DS_WRITE_B16, i32, "truncstorei16_local">;
defm : DSWritePat_mc <DS_WRITE_B8, i16, "truncstorei8_local">;
defm : DSWritePat_mc <DS_WRITE_B16, i16, "store_local">;
-foreach vt = VGPR_32.RegTypes in {
+foreach vt = Reg32Types.types in {
defm : DSWritePat_mc <DS_WRITE_B32, vt, "store_local">;
}
@@ -737,31 +749,35 @@ def : DSWritePat <DS_WRITE_B16_D16_HI, i32, store_hi16_local>;
def : DSWritePat <DS_WRITE_B8_D16_HI, i32, truncstorei8_hi16_local>;
}
-
-class DS64Bit4ByteAlignedReadPat<DS_Pseudo inst, PatFrag frag> : GCNPat <
- (v2i32 (frag (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1))),
+class DS64Bit4ByteAlignedReadPat<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
+ (vt:$value (frag (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1))),
(inst $ptr, $offset0, $offset1, (i1 0))
>;
-class DS64Bit4ByteAlignedWritePat<DS_Pseudo inst, PatFrag frag> : GCNPat<
- (frag v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1)),
- (inst $ptr, (i32 (EXTRACT_SUBREG $value, sub0)),
- (i32 (EXTRACT_SUBREG $value, sub1)), $offset0, $offset1,
+class DS64Bit4ByteAlignedWritePat<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat<
+ (frag vt:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1)),
+ (inst $ptr, (i32 (EXTRACT_SUBREG VReg_64:$value, sub0)),
+ (i32 (EXTRACT_SUBREG VReg_64:$value, sub1)), $offset0, $offset1,
(i1 0))
>;
-// v2i32 loads are split into i32 loads on SI during lowering, due to a bug
-// related to bounds checking.
-let OtherPredicates = [LDSRequiresM0Init, isGFX7Plus] in {
-def : DS64Bit4ByteAlignedReadPat<DS_READ2_B32, load_local_m0>;
-def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32, store_local_m0>;
-}
+multiclass DS64Bit4ByteAlignedPat_mc<ValueType vt> {
+ let OtherPredicates = [LDSRequiresM0Init, isGFX7Plus] in {
+ def : DS64Bit4ByteAlignedReadPat<DS_READ2_B32, vt, load_local_m0>;
+ def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32, vt, store_local_m0>;
+ }
-let OtherPredicates = [NotLDSRequiresM0Init] in {
-def : DS64Bit4ByteAlignedReadPat<DS_READ2_B32_gfx9, load_local>;
-def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32_gfx9, store_local>;
+ let OtherPredicates = [NotLDSRequiresM0Init] in {
+ def : DS64Bit4ByteAlignedReadPat<DS_READ2_B32_gfx9, vt, load_local>;
+ def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32_gfx9, vt, store_local>;
+ }
}
+// v2i32 loads are split into i32 loads on SI during lowering, due to a bug
+// related to bounds checking.
+foreach vt = VReg_64.RegTypes in {
+defm : DS64Bit4ByteAlignedPat_mc<vt>;
+}
let AddedComplexity = 100 in {
@@ -826,9 +842,12 @@ defm : DSAtomicRetPat_mc<DS_MAX_RTN_I32, i32, "atomic_load_max">;
defm : DSAtomicRetPat_mc<DS_MIN_RTN_U32, i32, "atomic_load_umin">;
defm : DSAtomicRetPat_mc<DS_MAX_RTN_U32, i32, "atomic_load_umax">;
defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B32, i32, "atomic_cmp_swap">;
+
+let SubtargetPredicate = HasLDSFPAtomics in {
defm : DSAtomicRetPat_mc<DS_MIN_RTN_F32, f32, "atomic_load_fmin">;
defm : DSAtomicRetPat_mc<DS_MAX_RTN_F32, f32, "atomic_load_fmax">;
defm : DSAtomicRetPat_mc<DS_ADD_RTN_F32, f32, "atomic_load_fadd">;
+}
// 64-bit atomics.
defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B64, i64, "atomic_swap">;
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 419513bdc2482..9c2f2e7eecd14 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -18,7 +18,6 @@
#include "Disassembler/AMDGPUDisassembler.h"
#include "AMDGPU.h"
-#include "AMDGPURegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIDefines.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
@@ -101,6 +100,18 @@ static DecodeStatus decodeSoppBrTarget(MCInst &Inst, unsigned Imm,
return addOperand(Inst, MCOperand::createImm(Imm));
}
+static DecodeStatus decodeSMEMOffset(MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder) {
+ auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+ int64_t Offset;
+ if (DAsm->isVI()) { // VI supports 20-bit unsigned offsets.
+ Offset = Imm & 0xFFFFF;
+ } else { // GFX9+ supports 21-bit signed offsets.
+ Offset = SignExtend64<21>(Imm);
+ }
+ return addOperand(Inst, MCOperand::createImm(Offset));
+}
+
static DecodeStatus decodeBoolReg(MCInst &Inst, unsigned Val,
uint64_t Addr, const void *Decoder) {
auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
@@ -285,6 +296,18 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
if (Bytes.size() >= 8) {
const uint64_t QW = eatBytes<uint64_t>(Bytes);
+ if (STI.getFeatureBits()[AMDGPU::FeatureGFX10_BEncoding]) {
+ Res = tryDecodeInst(DecoderTableGFX10_B64, MI, QW, Address);
+ if (Res) {
+ if (AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dpp8)
+ == -1)
+ break;
+ if (convertDPP8Inst(MI) == MCDisassembler::Success)
+ break;
+ MI = MCInst(); // clear
+ }
+ }
+
Res = tryDecodeInst(DecoderTableDPP864, MI, QW, Address);
if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
break;
@@ -334,6 +357,11 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
Res = tryDecodeInst(DecoderTableGFX932, MI, DW, Address);
if (Res) break;
+ if (STI.getFeatureBits()[AMDGPU::FeatureGFX10_BEncoding]) {
+ Res = tryDecodeInst(DecoderTableGFX10_B32, MI, DW, Address);
+ if (Res) break;
+ }
+
Res = tryDecodeInst(DecoderTableGFX1032, MI, DW, Address);
if (Res) break;
@@ -351,13 +379,6 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
Res = tryDecodeInst(DecoderTableGFX1064, MI, QW, Address);
} while (false);
- if (Res && (MaxInstBytesNum - Bytes.size()) == 12 && (!HasLiteral ||
- !(MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3))) {
- MaxInstBytesNum = 8;
- Bytes = Bytes_.slice(0, MaxInstBytesNum);
- eatBytes<uint64_t>(Bytes);
- }
-
if (Res && (MI.getOpcode() == AMDGPU::V_MAC_F32_e64_vi ||
MI.getOpcode() == AMDGPU::V_MAC_F32_e64_gfx6_gfx7 ||
MI.getOpcode() == AMDGPU::V_MAC_F32_e64_gfx10 ||
@@ -931,6 +952,7 @@ unsigned AMDGPUDisassembler::getAgprClassId(const OpWidthTy Width) const {
return AGPR_32RegClassID;
case OPW64: return AReg_64RegClassID;
case OPW128: return AReg_128RegClassID;
+ case OPW256: return AReg_256RegClassID;
case OPW512: return AReg_512RegClassID;
case OPW1024: return AReg_1024RegClassID;
}
@@ -1202,8 +1224,6 @@ bool AMDGPUSymbolizer::tryAddingSymbolicOperand(MCInst &Inst,
raw_ostream &/*cStream*/, int64_t Value,
uint64_t /*Address*/, bool IsBranch,
uint64_t /*Offset*/, uint64_t /*InstSize*/) {
- using SymbolInfoTy = std::tuple<uint64_t, StringRef, uint8_t>;
- using SectionSymbolsTy = std::vector<SymbolInfoTy>;
if (!IsBranch) {
return false;
@@ -1215,11 +1235,11 @@ bool AMDGPUSymbolizer::tryAddingSymbolicOperand(MCInst &Inst,
auto Result = std::find_if(Symbols->begin(), Symbols->end(),
[Value](const SymbolInfoTy& Val) {
- return std::get<0>(Val) == static_cast<uint64_t>(Value)
- && std::get<2>(Val) == ELF::STT_NOTYPE;
+ return Val.Addr == static_cast<uint64_t>(Value)
+ && Val.Type == ELF::STT_NOTYPE;
});
if (Result != Symbols->end()) {
- auto *Sym = Ctx.getOrCreateSymbol(std::get<1>(*Result));
+ auto *Sym = Ctx.getOrCreateSymbol(Result->Name);
const auto *Add = MCSymbolRefExpr::create(Sym, Ctx);
Inst.addOperand(MCOperand::createExpr(Add));
return true;
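As a standalone illustration of the new decodeSMEMOffset path, the 21-bit sign extension used for GFX9+ can be reproduced without the LLVM headers as follows; the helper name signExtend21 is invented here, and SignExtend64<21> in the hunk above is the real utility being mirrored.

#include <cstdint>

// Equivalent of SignExtend64<21>(Imm): treat bit 20 as the sign bit and
// propagate it through bit 63.
static int64_t signExtend21(uint64_t Imm) {
  return static_cast<int64_t>(Imm << (64 - 21)) >> (64 - 21);
}

// Example: 0x0FFFFF decodes to +1048575 and 0x100000 to -1048576, covering
// the full 21-bit signed range, while the VI path simply masks to 20 bits.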
diff --git a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
index 792e26d21f98d..97104a242d8c1 100644
--- a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
+++ b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
@@ -69,11 +69,11 @@ multiclass RAT_ATOMIC<bits<6> op_ret, bits<6> op_noret, string name> {
def _RTN: CF_MEM_RAT <op_ret, 0, 0xf,
(ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr),
(outs R600_Reg128:$out_gpr),
- name ## "_RTN" ## " $rw_gpr, $index_gpr", [] >;
+ name # "_RTN" # " $rw_gpr, $index_gpr", [] >;
def _NORET: CF_MEM_RAT <op_noret, 0, 0xf,
(ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr),
(outs R600_Reg128:$out_gpr),
- name ## " $rw_gpr, $index_gpr", [] >;
+ name # " $rw_gpr, $index_gpr", [] >;
}
}
@@ -118,11 +118,12 @@ def LOG_IEEE_eg : LOG_IEEE_Common<0x83>;
def RECIP_CLAMPED_eg : RECIP_CLAMPED_Common<0x84>;
def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>;
def : RsqPat<RECIPSQRT_IEEE_eg, f32>;
+def : SqrtPat<RECIPSQRT_IEEE_eg, RECIP_IEEE_eg>;
+
def SIN_eg : SIN_Common<0x8D>;
def COS_eg : COS_Common<0x8E>;
def : POW_Common <LOG_IEEE_eg, EXP_IEEE_eg, MUL>;
-def : EGPat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_eg $src))>;
} // End SubtargetPredicate = isEG
//===----------------------------------------------------------------------===//
@@ -421,6 +422,7 @@ def MULADD_UINT24_eg : R600_3OP <0x10, "MULADD_UINT24",
def : UMad24Pat<MULADD_UINT24_eg>;
def BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT", [], VecALU>;
+def : FSHRPattern <BIT_ALIGN_INT_eg>;
def : ROTRPattern <BIT_ALIGN_INT_eg>;
def MULADD_eg : MULADD_Common<0x14>;
def MULADD_IEEE_eg : MULADD_IEEE_Common<0x18>;
@@ -570,7 +572,7 @@ class R600_LDS_1A1D_NORET <bits<6> lds_op, string name, list<dag> pattern> :
}
class R600_LDS_1A1D_RET <bits<6> lds_op, string name, list<dag> pattern> :
- R600_LDS_1A1D <lds_op, (outs R600_Reg32:$dst), name##"_RET", pattern, "OQAP, "> {
+ R600_LDS_1A1D <lds_op, (outs R600_Reg32:$dst), name#"_RET", pattern, "OQAP, "> {
let BaseOp = name;
let usesCustomInserter = 1;
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 2057cac346d45..69facada2e964 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1,4 +1,4 @@
-//===-- FLATInstructions.td - FLAT Instruction Defintions -----------------===//
+//===-- FLATInstructions.td - FLAT Instruction Definitions ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -100,7 +100,7 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
!if(ps.is_flat_scratch, 0b01, 0));
// Signed offset. Highest bit ignored for flat and treated as 12-bit
- // unsigned for flat acceses.
+ // unsigned for flat accesses.
bits<13> offset;
bits<1> nv = 0; // XXX - What does this actually do?
@@ -175,7 +175,7 @@ class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass,
}
multiclass FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedInput = 0> {
- let is_flat_global = 1 in {
+ let is_flat_global = 1, SubtargetPredicate = HasFlatGlobalInsts in {
def "" : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1>,
GlobalSaddrTable<0, opName>;
def _SADDR : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1>,
@@ -183,8 +183,27 @@ multiclass FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass, bit Ha
}
}
+class FLAT_Global_Load_AddTid_Pseudo <string opName, RegisterClass regClass,
+ bit HasTiedOutput = 0, bit HasSignedOffset = 0> : FLAT_Pseudo<
+ opName,
+ (outs regClass:$vdst),
+ !con((ins SReg_64:$saddr, flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc),
+ !if(HasTiedOutput, (ins regClass:$vdst_in), (ins))),
+ " $vdst, $saddr$offset$glc$slc$dlc"> {
+ let is_flat_global = 1;
+ let has_data = 0;
+ let mayLoad = 1;
+ let has_vaddr = 0;
+ let has_saddr = 1;
+ let enabled_saddr = 1;
+ let maybeAtomic = 1;
+
+ let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", "");
+ let DisableEncoding = !if(HasTiedOutput, "$vdst_in", "");
+}
+
multiclass FLAT_Global_Store_Pseudo<string opName, RegisterClass regClass> {
- let is_flat_global = 1 in {
+ let is_flat_global = 1, SubtargetPredicate = HasFlatGlobalInsts in {
def "" : FLAT_Store_Pseudo<opName, regClass, 1>,
GlobalSaddrTable<0, opName>;
def _SADDR : FLAT_Store_Pseudo<opName, regClass, 1, 1>,
@@ -192,6 +211,24 @@ multiclass FLAT_Global_Store_Pseudo<string opName, RegisterClass regClass> {
}
}
+class FLAT_Global_Store_AddTid_Pseudo <string opName, RegisterClass vdataClass,
+ bit HasSignedOffset = 0> : FLAT_Pseudo<
+ opName,
+ (outs),
+ !con(
+ (ins vdataClass:$vdata, SReg_64:$saddr),
+ (ins flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc)),
+ " $vdata, $saddr$offset$glc$slc$dlc"> {
+ let is_flat_global = 1;
+ let mayLoad = 0;
+ let mayStore = 1;
+ let has_vdst = 0;
+ let has_vaddr = 0;
+ let has_saddr = 1;
+ let enabled_saddr = 1;
+ let maybeAtomic = 1;
+}
+
class FLAT_Scratch_Load_Pseudo <string opName, RegisterClass regClass,
bit EnableSaddr = 0>: FLAT_Pseudo<
opName,
@@ -279,6 +316,7 @@ multiclass FLAT_Atomic_Pseudo<
AtomicNoRet <opName, 0> {
let PseudoInstr = NAME;
let FPAtomic = isFP;
+ let AddedComplexity = -1; // Prefer global atomics if available
}
def _RTN : FLAT_AtomicRet_Pseudo <opName,
@@ -290,6 +328,7 @@ multiclass FLAT_Atomic_Pseudo<
GlobalSaddrTable<0, opName#"_rtn">,
AtomicNoRet <opName, 1>{
let FPAtomic = isFP;
+ let AddedComplexity = -1; // Prefer global atomics if available
}
}
@@ -367,10 +406,12 @@ multiclass FLAT_Global_Atomic_Pseudo<
SDPatternOperator atomic_rtn = null_frag,
SDPatternOperator atomic_no_rtn = null_frag,
ValueType data_vt = vt,
- RegisterClass data_rc = vdst_rc> :
- FLAT_Global_Atomic_Pseudo_NO_RTN<opName, vdst_rc, vt, atomic_no_rtn, data_vt, data_rc>,
- FLAT_Global_Atomic_Pseudo_RTN<opName, vdst_rc, vt, atomic_rtn, data_vt, data_rc>;
-
+ RegisterClass data_rc = vdst_rc> {
+ let is_flat_global = 1, SubtargetPredicate = HasFlatGlobalInsts in {
+ defm "" : FLAT_Global_Atomic_Pseudo_NO_RTN<opName, vdst_rc, vt, atomic_no_rtn, data_vt, data_rc>;
+ defm "" : FLAT_Global_Atomic_Pseudo_RTN<opName, vdst_rc, vt, atomic_rtn, data_vt, data_rc>;
+ }
+}
//===----------------------------------------------------------------------===//
// Flat Instructions
@@ -507,7 +548,6 @@ defm FLAT_ATOMIC_FMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmax_x2",
} // End SubtargetPredicate = isGFX7GFX10
-let SubtargetPredicate = HasFlatGlobalInsts in {
defm GLOBAL_LOAD_UBYTE : FLAT_Global_Load_Pseudo <"global_load_ubyte", VGPR_32>;
defm GLOBAL_LOAD_SBYTE : FLAT_Global_Load_Pseudo <"global_load_sbyte", VGPR_32>;
defm GLOBAL_LOAD_USHORT : FLAT_Global_Load_Pseudo <"global_load_ushort", VGPR_32>;
@@ -523,6 +563,8 @@ defm GLOBAL_LOAD_SBYTE_D16 : FLAT_Global_Load_Pseudo <"global_load_sbyte_d16"
defm GLOBAL_LOAD_SBYTE_D16_HI : FLAT_Global_Load_Pseudo <"global_load_sbyte_d16_hi", VGPR_32, 1>;
defm GLOBAL_LOAD_SHORT_D16 : FLAT_Global_Load_Pseudo <"global_load_short_d16", VGPR_32, 1>;
defm GLOBAL_LOAD_SHORT_D16_HI : FLAT_Global_Load_Pseudo <"global_load_short_d16_hi", VGPR_32, 1>;
+let OtherPredicates = [HasGFX10_BEncoding] in
+def GLOBAL_LOAD_DWORD_ADDTID : FLAT_Global_Load_AddTid_Pseudo <"global_load_dword_addtid", VGPR_32>;
defm GLOBAL_STORE_BYTE : FLAT_Global_Store_Pseudo <"global_store_byte", VGPR_32>;
defm GLOBAL_STORE_SHORT : FLAT_Global_Store_Pseudo <"global_store_short", VGPR_32>;
@@ -530,6 +572,8 @@ defm GLOBAL_STORE_DWORD : FLAT_Global_Store_Pseudo <"global_store_dword", VGPR
defm GLOBAL_STORE_DWORDX2 : FLAT_Global_Store_Pseudo <"global_store_dwordx2", VReg_64>;
defm GLOBAL_STORE_DWORDX3 : FLAT_Global_Store_Pseudo <"global_store_dwordx3", VReg_96>;
defm GLOBAL_STORE_DWORDX4 : FLAT_Global_Store_Pseudo <"global_store_dwordx4", VReg_128>;
+let OtherPredicates = [HasGFX10_BEncoding] in
+def GLOBAL_STORE_DWORD_ADDTID : FLAT_Global_Store_AddTid_Pseudo <"global_store_dword_addtid", VGPR_32>;
defm GLOBAL_STORE_BYTE_D16_HI : FLAT_Global_Store_Pseudo <"global_store_byte_d16_hi", VGPR_32>;
defm GLOBAL_STORE_SHORT_D16_HI : FLAT_Global_Store_Pseudo <"global_store_short_d16_hi", VGPR_32>;
@@ -615,9 +659,12 @@ defm GLOBAL_ATOMIC_INC_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_inc_x2",
defm GLOBAL_ATOMIC_DEC_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_dec_x2",
VReg_64, i64, atomic_dec_global_64>;
+
+let SubtargetPredicate = HasGFX10_BEncoding in
+defm GLOBAL_ATOMIC_CSUB : FLAT_Global_Atomic_Pseudo_RTN <"global_atomic_csub",
+ VGPR_32, i32, atomic_csub_global_32>;
} // End is_flat_global = 1
-} // End SubtargetPredicate = HasFlatGlobalInsts
let SubtargetPredicate = HasFlatScratchInsts in {
@@ -912,6 +959,7 @@ def : FlatSignedAtomicPat <GLOBAL_ATOMIC_OR_RTN, atomic_load_or_global_32, i32>;
def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SWAP_RTN, atomic_swap_global_32, i32>;
def : FlatSignedAtomicPat <GLOBAL_ATOMIC_CMPSWAP_RTN, AMDGPUatomic_cmp_swap_global_32, i32, v2i32>;
def : FlatSignedAtomicPat <GLOBAL_ATOMIC_XOR_RTN, atomic_load_xor_global_32, i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_CSUB_RTN, atomic_csub_global_32, i32>;
def : FlatSignedAtomicPat <GLOBAL_ATOMIC_ADD_X2_RTN, atomic_load_add_global_64, i64>;
def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SUB_X2_RTN, atomic_load_sub_global_64, i64>;
@@ -1212,6 +1260,9 @@ multiclass FLAT_Real_GlblAtomics_gfx10<bits<7> op> :
FLAT_Real_RTN_gfx10<op>,
FLAT_Real_SADDR_RTN_gfx10<op>;
+multiclass FLAT_Real_GlblAtomics_RTN_gfx10<bits<7> op> :
+ FLAT_Real_RTN_gfx10<op>,
+ FLAT_Real_SADDR_RTN_gfx10<op>;
// ENC_FLAT.
defm FLAT_LOAD_UBYTE : FLAT_Real_Base_gfx10<0x008>;
@@ -1297,6 +1348,7 @@ defm GLOBAL_ATOMIC_SWAP : FLAT_Real_GlblAtomics_gfx10<0x030>;
defm GLOBAL_ATOMIC_CMPSWAP : FLAT_Real_GlblAtomics_gfx10<0x031>;
defm GLOBAL_ATOMIC_ADD : FLAT_Real_GlblAtomics_gfx10<0x032>;
defm GLOBAL_ATOMIC_SUB : FLAT_Real_GlblAtomics_gfx10<0x033>;
+defm GLOBAL_ATOMIC_CSUB : FLAT_Real_GlblAtomics_RTN_gfx10<0x034>;
defm GLOBAL_ATOMIC_SMIN : FLAT_Real_GlblAtomics_gfx10<0x035>;
defm GLOBAL_ATOMIC_UMIN : FLAT_Real_GlblAtomics_gfx10<0x036>;
defm GLOBAL_ATOMIC_SMAX : FLAT_Real_GlblAtomics_gfx10<0x037>;
@@ -1325,7 +1377,8 @@ defm GLOBAL_ATOMIC_DEC_X2 : FLAT_Real_GlblAtomics_gfx10<0x05d>;
defm GLOBAL_ATOMIC_FCMPSWAP_X2 : FLAT_Real_GlblAtomics_gfx10<0x05e>;
defm GLOBAL_ATOMIC_FMIN_X2 : FLAT_Real_GlblAtomics_gfx10<0x05f>;
defm GLOBAL_ATOMIC_FMAX_X2 : FLAT_Real_GlblAtomics_gfx10<0x060>;
-
+defm GLOBAL_LOAD_DWORD_ADDTID : FLAT_Real_Base_gfx10<0x016>;
+defm GLOBAL_STORE_DWORD_ADDTID : FLAT_Real_Base_gfx10<0x017>;
// ENC_FLAT_SCRATCH.
defm SCRATCH_LOAD_UBYTE : FLAT_Real_AllAddr_gfx10<0x008>;
diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
index 10e2c3a263f17..719a968b83147 100644
--- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -105,6 +105,11 @@ public:
MachineFunctionPass::getAnalysisUsage(AU);
}
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties()
+ .set(MachineFunctionProperties::Property::IsSSA);
+ }
+
private:
int getDPPOp(unsigned Op) const;
};
@@ -168,7 +173,9 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
}
auto DPPInst = BuildMI(*OrigMI.getParent(), OrigMI,
- OrigMI.getDebugLoc(), TII->get(DPPOp));
+ OrigMI.getDebugLoc(), TII->get(DPPOp))
+ .setMIFlags(OrigMI.getFlags());
+
bool Fail = false;
do {
auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst);
@@ -506,15 +513,32 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
break;
}
+ auto *Src0 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0);
+ auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
+ if (Use != Src0 && !(Use == Src1 && OrigMI.isCommutable())) { // [1]
+ LLVM_DEBUG(dbgs() << " failed: no suitable operands\n");
+ break;
+ }
+
+ assert(Src0 && "Src1 without Src0?");
+ if (Src1 && Src1->isIdenticalTo(*Src0)) {
+ assert(Src1->isReg());
+ LLVM_DEBUG(
+ dbgs()
+ << " " << OrigMI
+ << " failed: DPP register is used more than once per instruction\n");
+ break;
+ }
+
LLVM_DEBUG(dbgs() << " combining: " << OrigMI);
- if (Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)) {
+ if (Use == Src0) {
if (auto *DPPInst = createDPPInst(OrigMI, MovMI, CombOldVGPR,
OldOpndValue, CombBCZ)) {
DPPMIs.push_back(DPPInst);
Rollback = false;
}
- } else if (OrigMI.isCommutable() &&
- Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) {
+ } else {
+ assert(Use == Src1 && OrigMI.isCommutable()); // by check [1]
auto *BB = OrigMI.getParent();
auto *NewMI = BB->getParent()->CloneMachineInstr(&OrigMI);
BB->insert(OrigMI, NewMI);
@@ -528,8 +552,7 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
} else
LLVM_DEBUG(dbgs() << " failed: cannot be commuted\n");
NewMI->eraseFromParent();
- } else
- LLVM_DEBUG(dbgs() << " failed: no suitable operands\n");
+ }
if (Rollback)
break;
OrigMIs.push_back(&OrigMI);
@@ -562,8 +585,6 @@ bool GCNDPPCombine::runOnMachineFunction(MachineFunction &MF) {
MRI = &MF.getRegInfo();
TII = ST.getInstrInfo();
- assert(MRI->isSSA() && "Must be run on SSA");
-
bool Changed = false;
for (auto &MBB : MF) {
for (auto I = MBB.rbegin(), E = MBB.rend(); I != E;) {
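A minimal sketch of the pattern this hunk adopts: declaring IsSSA as a required machine function property lets the pass manager and verifier enforce SSA form, replacing the old runtime assert on MRI->isSSA(). The pass name ExamplePass is hypothetical; only the getRequiredProperties override reflects the change above.

#include "llvm/CodeGen/MachineFunctionPass.h"
using namespace llvm;

namespace {
// Hypothetical pass used only to show the property declaration.
class ExamplePass : public MachineFunctionPass {
public:
  static char ID;
  ExamplePass() : MachineFunctionPass(ID) {}

  // The pass framework now checks SSA form up front instead of the pass
  // asserting it inside runOnMachineFunction.
  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::IsSSA);
  }

  bool runOnMachineFunction(MachineFunction &) override { return false; }
};
} // end anonymous namespace

char ExamplePass::ID = 0;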
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 3ef5a77af45e2..8482dbfec250b 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -228,11 +228,6 @@ void GCNHazardRecognizer::processBundle() {
CurrCycleInstr = nullptr;
}
-unsigned GCNHazardRecognizer::PreEmitNoops(SUnit *SU) {
- IsHazardRecognizerMode = false;
- return PreEmitNoopsCommon(SU->getInstr());
-}
-
unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
IsHazardRecognizerMode = true;
CurrCycleInstr = MI;
@@ -486,6 +481,14 @@ void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
addRegsToSet(TRI, MI.uses(), ClauseUses);
}
+static bool breaksSMEMSoftClause(MachineInstr *MI) {
+ return !SIInstrInfo::isSMRD(*MI);
+}
+
+static bool breaksVMEMSoftClause(MachineInstr *MI) {
+ return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
+}
+
int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
// SMEM soft clause are only present on VI+, and only matter if xnack is
// enabled.
@@ -512,7 +515,7 @@ int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
if (!MI)
break;
- if (IsSMRD != SIInstrInfo::isSMRD(*MI))
+ if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
break;
addClauseInst(*MI);
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index 6aa2e70dfbfb9..cd17f2755bd10 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -105,7 +105,6 @@ public:
void EmitInstruction(MachineInstr *MI) override;
HazardType getHazardType(SUnit *SU, int Stalls) override;
void EmitNoop() override;
- unsigned PreEmitNoops(SUnit *SU) override;
unsigned PreEmitNoops(MachineInstr *) override;
unsigned PreEmitNoopsCommon(MachineInstr *);
void AdvanceCycle() override;
diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
index 90ab6a14ce20d..75a02c8390343 100644
--- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@@ -5,6 +5,11 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements the class GCNIterativeScheduler.
+///
+//===----------------------------------------------------------------------===//
#include "GCNIterativeScheduler.h"
#include "AMDGPUSubtarget.h"
diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h
index e6f83914af5ba..a0d4f432aa48d 100644
--- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h
+++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h
@@ -5,6 +5,14 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines the class GCNIterativeScheduler, which uses an iterative
+/// approach to find the best schedule for the GCN architecture. It builds
+/// several lightweight candidate schedules, scores them, picks the best one
+/// based on those scores, and finally applies the chosen schedule.
+///
+//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_AMDGPU_GCNITERATIVESCHEDULER_H
#define LLVM_LIB_TARGET_AMDGPU_GCNITERATIVESCHEDULER_H
diff --git a/llvm/lib/Target/AMDGPU/GCNMinRegStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
index c469cf290e264..884b2e17289c5 100644
--- a/llvm/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
@@ -5,6 +5,13 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines and implements the class GCNMinRegScheduler, an
+/// experimental, simple scheduler whose main goal is to use as few registers
+/// as possible for a region.
+///
+//===----------------------------------------------------------------------===//
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallPtrSet.h"
@@ -207,9 +214,8 @@ void GCNMinRegScheduler::bumpPredsPriority(const SUnit *SchedSU, int Priority) {
LLVM_DEBUG(dbgs() << "Make the predecessors of SU(" << SchedSU->NodeNum
<< ")'s non-ready successors of " << Priority
<< " priority in ready queue: ");
- const auto SetEnd = Set.end();
for (auto &C : RQ) {
- if (Set.find(C.SU) != SetEnd) {
+ if (Set.count(C.SU)) {
C.Priority = Priority;
LLVM_DEBUG(dbgs() << " SU(" << C.SU->NodeNum << ')');
}
diff --git a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp
index f6023f3a40a27..57346087d0175 100644
--- a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp
@@ -286,8 +286,15 @@ bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) {
}
Intervals.push_back(LI);
OrigRegs.push_back(VRM->getPhys(Reg));
- MinInd = I ? std::min(MinInd, LI->beginIndex()) : LI->beginIndex();
- MaxInd = I ? std::max(MaxInd, LI->endIndex()) : LI->endIndex();
+ if (LI->empty()) {
+ // The address input is undef, so it doesn't contribute to the relevant
+ // range. Seed a reasonable index range if required.
+ if (I == 0)
+ MinInd = MaxInd = LIS->getInstructionIndex(*MI);
+ continue;
+ }
+ MinInd = I != 0 ? std::min(MinInd, LI->beginIndex()) : LI->beginIndex();
+ MaxInd = I != 0 ? std::max(MaxInd, LI->endIndex()) : LI->endIndex();
}
if (Intervals.empty())
diff --git a/llvm/lib/Target/AMDGPU/GCNProcessors.td b/llvm/lib/Target/AMDGPU/GCNProcessors.td
index b926041afb2fe..17e6098d880d5 100644
--- a/llvm/lib/Target/AMDGPU/GCNProcessors.td
+++ b/llvm/lib/Target/AMDGPU/GCNProcessors.td
@@ -183,3 +183,7 @@ def : ProcessorModel<"gfx1011", GFX10SpeedModel,
def : ProcessorModel<"gfx1012", GFX10SpeedModel,
FeatureISAVersion10_1_2.Features
>;
+
+def : ProcessorModel<"gfx1030", GFX10SpeedModel,
+ FeatureISAVersion10_3_0.Features
+>;
diff --git a/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp b/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp
index 76593bc0e5aca..98d971630ca4f 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp
@@ -168,13 +168,15 @@ private:
// 8 banks for SGPRs.
// Registers already processed and recorded in RegsUsed are excluded.
// If Bank is not -1 assume Reg:SubReg to belong to that Bank.
- unsigned getRegBankMask(unsigned Reg, unsigned SubReg, int Bank);
+ uint32_t getRegBankMask(unsigned Reg, unsigned SubReg, int Bank);
- // Return number of stalls in the instructions.
- // UsedBanks has bits set for the banks used by all operands.
- // If Reg and Bank provided substitute the Reg with the Bank.
- unsigned analyzeInst(const MachineInstr& MI, unsigned& UsedBanks,
- unsigned Reg = AMDGPU::NoRegister, int Bank = -1);
+ // Analyze one instruction returning the number of stalls and a mask of the
+ // banks used by all operands.
+ // If Reg and Bank are provided, assume all uses of Reg will be replaced with
+ // a register chosen from Bank.
+ std::pair<unsigned, unsigned> analyzeInst(const MachineInstr &MI,
+ unsigned Reg = AMDGPU::NoRegister,
+ int Bank = -1);
// Return true if register is regular VGPR or SGPR or their tuples.
// Returns false for special registers like m0, vcc etc.
@@ -280,7 +282,9 @@ unsigned GCNRegBankReassign::getPhysRegBank(unsigned Reg) const {
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
unsigned Size = TRI->getRegSizeInBits(*RC);
- if (Size > 32)
+ if (Size == 16)
+ Reg = TRI->get32BitRegister(Reg);
+ else if (Size > 32)
Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
if (TRI->hasVGPRs(RC)) {
@@ -292,7 +296,7 @@ unsigned GCNRegBankReassign::getPhysRegBank(unsigned Reg) const {
return Reg % NUM_SGPR_BANKS + SGPR_BANK_OFFSET;
}
-unsigned GCNRegBankReassign::getRegBankMask(unsigned Reg, unsigned SubReg,
+uint32_t GCNRegBankReassign::getRegBankMask(unsigned Reg, unsigned SubReg,
int Bank) {
if (Register::isVirtualRegister(Reg)) {
if (!VRM->isAssignedReg(Reg))
@@ -306,14 +310,21 @@ unsigned GCNRegBankReassign::getRegBankMask(unsigned Reg, unsigned SubReg,
}
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
- unsigned Size = TRI->getRegSizeInBits(*RC) / 32;
- if (Size > 1)
- Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
+ unsigned Size = TRI->getRegSizeInBits(*RC);
+
+ if (Size == 16) {
+ Reg = TRI->get32BitRegister(Reg);
+ Size = 1;
+ } else {
+ Size /= 32;
+ if (Size > 1)
+ Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
+ }
if (TRI->hasVGPRs(RC)) {
// VGPRs have 4 banks assigned in a round-robin fashion.
Reg -= AMDGPU::VGPR0;
- unsigned Mask = (1 << Size) - 1;
+ uint32_t Mask = maskTrailingOnes<uint32_t>(Size);
unsigned Used = 0;
// Bitmask lacks an extract method
for (unsigned I = 0; I < Size; ++I)
@@ -321,7 +332,7 @@ unsigned GCNRegBankReassign::getRegBankMask(unsigned Reg, unsigned SubReg,
Used |= 1 << I;
RegsUsed.set(Reg, Reg + Size);
Mask &= ~Used;
- Mask <<= (Bank == -1) ? Reg % NUM_VGPR_BANKS : unsigned(Bank);
+ Mask <<= (Bank == -1) ? Reg % NUM_VGPR_BANKS : uint32_t(Bank);
return (Mask | (Mask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK;
}
@@ -347,15 +358,14 @@ unsigned GCNRegBankReassign::getRegBankMask(unsigned Reg, unsigned SubReg,
return Mask << SGPR_BANK_OFFSET;
}
-unsigned GCNRegBankReassign::analyzeInst(const MachineInstr& MI,
- unsigned& UsedBanks,
- unsigned Reg,
- int Bank) {
+std::pair<unsigned, unsigned>
+GCNRegBankReassign::analyzeInst(const MachineInstr &MI, unsigned Reg,
+ int Bank) {
unsigned StallCycles = 0;
- UsedBanks = 0;
+ unsigned UsedBanks = 0;
if (MI.isDebugValue())
- return 0;
+ return std::make_pair(StallCycles, UsedBanks);
RegsUsed.reset();
OperandMasks.clear();
@@ -372,30 +382,30 @@ unsigned GCNRegBankReassign::analyzeInst(const MachineInstr& MI,
unsigned ShiftedBank = Bank;
if (Bank != -1 && R == Reg && Op.getSubReg()) {
- unsigned LM = TRI->getSubRegIndexLaneMask(Op.getSubReg()).getAsInteger();
- if (!(LM & 1) && (Bank < NUM_VGPR_BANKS)) {
+ unsigned Offset = TRI->getChannelFromSubReg(Op.getSubReg());
+ LaneBitmask LM = TRI->getSubRegIndexLaneMask(Op.getSubReg());
+ if (Offset && Bank < NUM_VGPR_BANKS) {
// If a register spans all banks we cannot shift it to avoid conflict.
- if (countPopulation(LM) >= NUM_VGPR_BANKS)
+ if (TRI->getNumCoveredRegs(LM) >= NUM_VGPR_BANKS)
continue;
- ShiftedBank = (Bank + countTrailingZeros(LM)) % NUM_VGPR_BANKS;
- } else if (!(LM & 3) && (Bank >= SGPR_BANK_OFFSET)) {
+ ShiftedBank = (Bank + Offset) % NUM_VGPR_BANKS;
+ } else if (Offset > 1 && Bank >= SGPR_BANK_OFFSET) {
// If a register spans all banks we cannot shift it to avoid conflict.
- if (countPopulation(LM) / 2 >= NUM_SGPR_BANKS)
+ if (TRI->getNumCoveredRegs(LM) / 2 >= NUM_SGPR_BANKS)
continue;
- ShiftedBank = SGPR_BANK_OFFSET + (Bank - SGPR_BANK_OFFSET +
- (countTrailingZeros(LM) >> 1)) %
- NUM_SGPR_BANKS;
+ ShiftedBank = SGPR_BANK_OFFSET +
+ (Bank - SGPR_BANK_OFFSET + (Offset >> 1)) % NUM_SGPR_BANKS;
}
}
- unsigned Mask = getRegBankMask(R, Op.getSubReg(),
+ uint32_t Mask = getRegBankMask(R, Op.getSubReg(),
(Reg == R) ? ShiftedBank : -1);
StallCycles += countPopulation(UsedBanks & Mask);
UsedBanks |= Mask;
OperandMasks.push_back(OperandMask(Op.getReg(), Op.getSubReg(), Mask));
}
- return StallCycles;
+ return std::make_pair(StallCycles, UsedBanks);
}
unsigned GCNRegBankReassign::getOperandGatherWeight(const MachineInstr& MI,
@@ -440,10 +450,19 @@ bool GCNRegBankReassign::isReassignable(unsigned Reg) const {
}
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysReg);
+ unsigned Size = TRI->getRegSizeInBits(*RC);
+
+  // TODO: Support 16-bit registers. Those need to be moved with their
+  // parent VGPR_32 and potentially a sibling 16-bit sub-register.
+ if (Size < 32)
+ return false;
+
if (TRI->hasVGPRs(RC))
return true;
- unsigned Size = TRI->getRegSizeInBits(*RC);
+ if (Size == 16)
+ return AMDGPU::SGPR_LO16RegClass.contains(PhysReg);
+
if (Size > 32)
PhysReg = TRI->getSubReg(PhysReg, AMDGPU::sub0);
@@ -496,16 +515,16 @@ unsigned GCNRegBankReassign::getFreeBanks(unsigned Reg,
unsigned FreeBanks = getFreeBanks(Mask, UsedBanks);
- unsigned LM = TRI->getSubRegIndexLaneMask(SubReg).getAsInteger();
- if (!(LM & 1) && (Mask & VGPR_BANK_MASK)) {
- unsigned Shift = countTrailingZeros(LM);
+ unsigned Offset = TRI->getChannelFromSubReg(SubReg);
+ if (Offset && (Mask & VGPR_BANK_MASK)) {
+ unsigned Shift = Offset;
if (Shift >= NUM_VGPR_BANKS)
return 0;
unsigned VB = FreeBanks & VGPR_BANK_MASK;
FreeBanks = ((VB >> Shift) | (VB << (NUM_VGPR_BANKS - Shift))) &
VGPR_BANK_MASK;
- } else if (!(LM & 3) && (Mask & SGPR_BANK_MASK)) {
- unsigned Shift = countTrailingZeros(LM) >> 1;
+ } else if (Offset > 1 && (Mask & SGPR_BANK_MASK)) {
+ unsigned Shift = Offset >> 1;
if (Shift >= NUM_SGPR_BANKS)
return 0;
unsigned SB = FreeBanks >> SGPR_BANK_OFFSET;
@@ -570,7 +589,6 @@ unsigned GCNRegBankReassign::computeStallCycles(unsigned SrcReg,
unsigned Reg, int Bank,
bool Collect) {
unsigned TotalStallCycles = 0;
- unsigned UsedBanks = 0;
SmallSet<const MachineInstr *, 16> Visited;
for (auto &MI : MRI->use_nodbg_instructions(SrcReg)) {
@@ -578,7 +596,9 @@ unsigned GCNRegBankReassign::computeStallCycles(unsigned SrcReg,
continue;
if (!Visited.insert(&MI).second)
continue;
- unsigned StallCycles = analyzeInst(MI, UsedBanks, Reg, Bank);
+ unsigned StallCycles;
+ unsigned UsedBanks;
+ std::tie(StallCycles, UsedBanks) = analyzeInst(MI, Reg, Bank);
TotalStallCycles += StallCycles;
if (Collect)
collectCandidates(MI, UsedBanks, StallCycles);
@@ -636,7 +656,11 @@ unsigned GCNRegBankReassign::tryReassign(Candidate &C) {
struct BankStall {
BankStall(unsigned b, unsigned s) : Bank(b), Stalls(s) {};
- bool operator< (const BankStall &RHS) const { return Stalls > RHS.Stalls; }
+ bool operator<(const BankStall &RHS) const {
+ if (Stalls == RHS.Stalls)
+ return Bank < RHS.Bank;
+ return Stalls > RHS.Stalls;
+ }
unsigned Bank;
unsigned Stalls;
};
@@ -653,7 +677,7 @@ unsigned GCNRegBankReassign::tryReassign(Candidate &C) {
}
}
}
- std::sort(BankStalls.begin(), BankStalls.end());
+ llvm::sort(BankStalls);
Register OrigReg = VRM->getPhys(C.Reg);
LRM->unassign(LI);
@@ -695,8 +719,9 @@ unsigned GCNRegBankReassign::collectCandidates(MachineFunction &MF,
if (MI.isBundle())
continue; // we analyze the instructions inside the bundle individually
- unsigned UsedBanks = 0;
- unsigned StallCycles = analyzeInst(MI, UsedBanks);
+ unsigned StallCycles;
+ unsigned UsedBanks;
+ std::tie(StallCycles, UsedBanks) = analyzeInst(MI);
if (Collect)
collectCandidates(MI, UsedBanks, StallCycles);
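The BankStall comparator change above adds a tie-break so that sorting is deterministic. A self-contained sketch of that ordering, using std::sort in place of llvm::sort and an illustrative sortBankStalls wrapper:

#include <algorithm>
#include <vector>

// Sort candidate banks by descending stall count; break ties by ascending
// bank number so equal-stall banks always come out in the same order.
struct BankStall {
  unsigned Bank;
  unsigned Stalls;
  bool operator<(const BankStall &RHS) const {
    if (Stalls == RHS.Stalls)
      return Bank < RHS.Bank;
    return Stalls > RHS.Stalls;
  }
};

static void sortBankStalls(std::vector<BankStall> &BankStalls) {
  std::sort(BankStalls.begin(), BankStalls.end());
}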
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index d593204cba059..86a3cb9af32fa 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -5,6 +5,11 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements the GCNRegPressure class.
+///
+//===----------------------------------------------------------------------===//
#include "GCNRegPressure.h"
#include "AMDGPUSubtarget.h"
@@ -98,7 +103,8 @@ void GCNRegPressure::inc(unsigned Reg,
LaneBitmask PrevMask,
LaneBitmask NewMask,
const MachineRegisterInfo &MRI) {
- if (NewMask == PrevMask)
+ if (SIRegisterInfo::getNumCoveredRegs(NewMask) ==
+ SIRegisterInfo::getNumCoveredRegs(PrevMask))
return;
int Sign = 1;
@@ -106,25 +112,21 @@ void GCNRegPressure::inc(unsigned Reg,
std::swap(NewMask, PrevMask);
Sign = -1;
}
-#ifndef NDEBUG
- const auto MaxMask = MRI.getMaxLaneMaskForVReg(Reg);
-#endif
+
switch (auto Kind = getRegKind(Reg, MRI)) {
case SGPR32:
case VGPR32:
case AGPR32:
- assert(PrevMask.none() && NewMask == MaxMask);
Value[Kind] += Sign;
break;
case SGPR_TUPLE:
case VGPR_TUPLE:
case AGPR_TUPLE:
- assert(NewMask < MaxMask || NewMask == MaxMask);
assert(PrevMask < NewMask);
Value[Kind == SGPR_TUPLE ? SGPR32 : Kind == AGPR_TUPLE ? AGPR32 : VGPR32] +=
- Sign * (~PrevMask & NewMask).getNumLanes();
+ Sign * SIRegisterInfo::getNumCoveredRegs(~PrevMask & NewMask);
if (PrevMask.none()) {
assert(NewMask.any());
@@ -216,7 +218,7 @@ static LaneBitmask getUsedRegMask(const MachineOperand &MO,
return MRI.getTargetRegisterInfo()->getSubRegIndexLaneMask(SubReg);
auto MaxMask = MRI.getMaxLaneMaskForVReg(MO.getReg());
- if (MaxMask == LaneBitmask::getLane(0)) // cannot have subregs
+ if (SIRegisterInfo::getNumCoveredRegs(MaxMask) > 1) // cannot have subregs
return MaxMask;
// For a tentative schedule LIS isn't updated yet but livemask should remain
@@ -327,8 +329,9 @@ void GCNUpwardRPTracker::recede(const MachineInstr &MI) {
// update max pressure
MaxPressure = max(AtMIPressure, MaxPressure);
- for (const auto &MO : MI.defs()) {
- if (!MO.isReg() || !Register::isVirtualRegister(MO.getReg()) || MO.isDead())
+ for (const auto &MO : MI.operands()) {
+ if (!MO.isReg() || !MO.isDef() ||
+ !Register::isVirtualRegister(MO.getReg()) || MO.isDead())
continue;
auto Reg = MO.getReg();
@@ -403,8 +406,8 @@ void GCNDownwardRPTracker::advanceToNext() {
LastTrackedMI = &*NextMI++;
// Add new registers or mask bits.
- for (const auto &MO : LastTrackedMI->defs()) {
- if (!MO.isReg())
+ for (const auto &MO : LastTrackedMI->operands()) {
+ if (!MO.isReg() || !MO.isDef())
continue;
Register Reg = MO.getReg();
if (!Register::isVirtualRegister(Reg))
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 5862cdb041669..2ef79410719f6 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -5,6 +5,14 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines the GCNRegPressure class, which tracks register pressure
+/// by bookkeeping the number of SGPRs/VGPRs used and the weights of large
+/// SGPR/VGPR tuples. It also implements a compare function, which compares
+/// different register pressures and declares the one with the higher
+/// occupancy the winner.
+///
+//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H
#define LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H
@@ -208,7 +216,7 @@ getLiveRegMap(Range &&R, bool After, LiveIntervals &LIS) {
auto SI = SII.getInstructionIndex(*I);
Indexes.push_back(After ? SI.getDeadSlot() : SI.getBaseIndex());
}
- std::sort(Indexes.begin(), Indexes.end());
+ llvm::sort(Indexes);
auto &MRI = (*R.begin())->getParent()->getParent()->getRegInfo();
DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> LiveRegMap;
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index e109eed5f6071..deed50b6db7df 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -50,9 +50,9 @@ void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) {
VGPRCriticalLimit = ST.getMaxNumVGPRs(TargetOccupancy);
} else {
SGPRCriticalLimit = SRI->getRegPressureSetLimit(DAG->MF,
- SRI->getSGPRPressureSet());
+ AMDGPU::RegisterPressureSets::SReg_32);
VGPRCriticalLimit = SRI->getRegPressureSetLimit(DAG->MF,
- SRI->getVGPRPressureSet());
+ AMDGPU::RegisterPressureSets::VGPR_32);
}
SGPRCriticalLimit -= ErrorMargin;
@@ -83,8 +83,8 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU
TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
}
- unsigned NewSGPRPressure = Pressure[SRI->getSGPRPressureSet()];
- unsigned NewVGPRPressure = Pressure[SRI->getVGPRPressureSet()];
+ unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
+ unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
// If two instructions increase the pressure of different register sets
// by the same amount, the generic scheduler will prefer to schedule the
@@ -109,12 +109,12 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU
// marked as RegExcess in tryCandidate() when they are compared with
// instructions that increase the register pressure.
if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
- Cand.RPDelta.Excess = PressureChange(SRI->getVGPRPressureSet());
+ Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
}
if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
- Cand.RPDelta.Excess = PressureChange(SRI->getSGPRPressureSet());
+ Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
}
@@ -128,10 +128,12 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU
if (SGPRDelta >= 0 || VGPRDelta >= 0) {
if (SGPRDelta > VGPRDelta) {
- Cand.RPDelta.CriticalMax = PressureChange(SRI->getSGPRPressureSet());
+ Cand.RPDelta.CriticalMax =
+ PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
} else {
- Cand.RPDelta.CriticalMax = PressureChange(SRI->getVGPRPressureSet());
+ Cand.RPDelta.CriticalMax =
+ PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
}
}
@@ -145,8 +147,8 @@ void GCNMaxOccupancySchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
SchedCandidate &Cand) {
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
- unsigned SGPRPressure = Pressure[SRI->getSGPRPressureSet()];
- unsigned VGPRPressure = Pressure[SRI->getVGPRPressureSet()];
+ unsigned SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
+ unsigned VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
ReadyQueue &Q = Zone.Available;
for (SUnit *SU : Q) {
@@ -231,33 +233,11 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
// Pick best from BotCand and TopCand.
LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
dbgs() << "Bot Cand: "; traceCandidate(BotCand););
- SchedCandidate Cand;
- if (TopCand.Reason == BotCand.Reason) {
- Cand = BotCand;
- GenericSchedulerBase::CandReason TopReason = TopCand.Reason;
- TopCand.Reason = NoCand;
- GenericScheduler::tryCandidate(Cand, TopCand, nullptr);
- if (TopCand.Reason != NoCand) {
- Cand.setBest(TopCand);
- } else {
- TopCand.Reason = TopReason;
- }
- } else {
- if (TopCand.Reason == RegExcess && TopCand.RPDelta.Excess.getUnitInc() <= 0) {
- Cand = TopCand;
- } else if (BotCand.Reason == RegExcess && BotCand.RPDelta.Excess.getUnitInc() <= 0) {
- Cand = BotCand;
- } else if (TopCand.Reason == RegCritical && TopCand.RPDelta.CriticalMax.getUnitInc() <= 0) {
- Cand = TopCand;
- } else if (BotCand.Reason == RegCritical && BotCand.RPDelta.CriticalMax.getUnitInc() <= 0) {
- Cand = BotCand;
- } else {
- if (BotCand.Reason > TopCand.Reason) {
- Cand = TopCand;
- } else {
- Cand = BotCand;
- }
- }
+ SchedCandidate Cand = BotCand;
+ TopCand.Reason = NoCand;
+ GenericScheduler::tryCandidate(Cand, TopCand, nullptr);
+ if (TopCand.Reason != NoCand) {
+ Cand.setBest(TopCand);
}
LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););
@@ -316,13 +296,13 @@ GCNScheduleDAGMILive::GCNScheduleDAGMILive(MachineSchedContext *C,
ST(MF.getSubtarget<GCNSubtarget>()),
MFI(*MF.getInfo<SIMachineFunctionInfo>()),
StartingOccupancy(MFI.getOccupancy()),
- MinOccupancy(StartingOccupancy), Stage(0), RegionIdx(0) {
+ MinOccupancy(StartingOccupancy), Stage(Collect), RegionIdx(0) {
LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
}
void GCNScheduleDAGMILive::schedule() {
- if (Stage == 0) {
+ if (Stage == Collect) {
// Just record regions at the first pass.
Regions.push_back(std::make_pair(RegionBegin, RegionEnd));
return;
@@ -348,6 +328,7 @@ void GCNScheduleDAGMILive::schedule() {
ScheduleDAGMILive::schedule();
Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd);
+ RescheduleRegions[RegionIdx] = false;
if (!LIS)
return;
@@ -389,20 +370,28 @@ void GCNScheduleDAGMILive::schedule() {
<< MinOccupancy << ".\n");
}
+ unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
+ unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
+ if (PressureAfter.getVGPRNum() > MaxVGPRs ||
+ PressureAfter.getSGPRNum() > MaxSGPRs)
+ RescheduleRegions[RegionIdx] = true;
+
if (WavesAfter >= MinOccupancy) {
- unsigned TotalVGPRs = AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST);
- unsigned TotalSGPRs = AMDGPU::IsaInfo::getAddressableNumSGPRs(&ST);
- if (WavesAfter > MFI.getMinWavesPerEU() ||
+ if (Stage == UnclusteredReschedule &&
+ !PressureAfter.less(ST, PressureBefore)) {
+ LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
+ } else if (WavesAfter > MFI.getMinWavesPerEU() ||
PressureAfter.less(ST, PressureBefore) ||
- (TotalVGPRs >= PressureAfter.getVGPRNum() &&
- TotalSGPRs >= PressureAfter.getSGPRNum())) {
+ !RescheduleRegions[RegionIdx]) {
Pressure[RegionIdx] = PressureAfter;
return;
+ } else {
+ LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
}
- LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
}
LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
+ RescheduleRegions[RegionIdx] = true;
RegionEnd = RegionBegin;
for (MachineInstr *MI : Unsched) {
if (MI->isDebugInstr())
@@ -532,33 +521,55 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
LiveIns.resize(Regions.size());
Pressure.resize(Regions.size());
+ RescheduleRegions.resize(Regions.size());
+ RescheduleRegions.set();
if (!Regions.empty())
BBLiveInMap = getBBLiveInMap();
+ std::vector<std::unique_ptr<ScheduleDAGMutation>> SavedMutations;
+
do {
Stage++;
RegionIdx = 0;
MachineBasicBlock *MBB = nullptr;
- if (Stage > 1) {
+ if (Stage > InitialSchedule) {
+ if (!LIS)
+ break;
+
// Retry function scheduling if the resulting occupancy is lower than the
// occupancy used for the first-pass scheduling. This will give more freedom
// to schedule low register pressure blocks.
// Code is partially copied from MachineSchedulerBase::scheduleRegions().
- if (!LIS || StartingOccupancy <= MinOccupancy)
- break;
+ if (Stage == UnclusteredReschedule) {
+ if (RescheduleRegions.none())
+ continue;
+ LLVM_DEBUG(dbgs() <<
+ "Retrying function scheduling without clustering.\n");
+ }
+
+ if (Stage == ClusteredLowOccupancyReschedule) {
+ if (StartingOccupancy <= MinOccupancy)
+ break;
- LLVM_DEBUG(
- dbgs()
- << "Retrying function scheduling with lowest recorded occupancy "
- << MinOccupancy << ".\n");
+ LLVM_DEBUG(
+ dbgs()
+ << "Retrying function scheduling with lowest recorded occupancy "
+ << MinOccupancy << ".\n");
- S.setTargetOccupancy(MinOccupancy);
+ S.setTargetOccupancy(MinOccupancy);
+ }
}
+ if (Stage == UnclusteredReschedule)
+ SavedMutations.swap(Mutations);
+
for (auto Region : Regions) {
+ if (Stage == UnclusteredReschedule && !RescheduleRegions[RegionIdx])
+ continue;
+
RegionBegin = Region.first;
RegionEnd = Region.second;
@@ -566,7 +577,7 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
if (MBB) finishBlock();
MBB = RegionBegin->getParent();
startBlock(MBB);
- if (Stage == 1)
+ if (Stage == InitialSchedule)
computeBlockPressure(MBB);
}
@@ -594,5 +605,7 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
}
finishBlock();
- } while (Stage < 2);
+ if (Stage == UnclusteredReschedule)
+ SavedMutations.swap(Mutations);
+ } while (Stage != LastStage);
}
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index dd687a930c79a..2d81d9977c31d 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -64,6 +64,14 @@ public:
class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
+ enum : unsigned {
+ Collect,
+ InitialSchedule,
+ UnclusteredReschedule,
+ ClusteredLowOccupancyReschedule,
+ LastStage = ClusteredLowOccupancyReschedule
+ };
+
const GCNSubtarget &ST;
SIMachineFunctionInfo &MFI;
@@ -84,6 +92,10 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
SmallVector<std::pair<MachineBasicBlock::iterator,
MachineBasicBlock::iterator>, 32> Regions;
+ // Records if a region is not yet scheduled, or its schedule has been
+ // reverted, or we generally desire to reschedule it.
+ BitVector RescheduleRegions;
+
// Region live-in cache.
SmallVector<GCNRPTracker::LiveRegSet, 32> LiveIns;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
index 1f94ab7991225..ea6e9038fd1e1 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -17,6 +17,7 @@
#include "llvm/MC/MCFixupKindInfo.h"
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/Support/EndianStream.h"
#include "llvm/Support/TargetRegistry.h"
#include "Utils/AMDGPUBaseInfo.h"
@@ -39,8 +40,8 @@ public:
const MCRelaxableFragment *DF,
const MCAsmLayout &Layout) const override;
- void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
- MCInst &Res) const override;
+ void relaxInstruction(MCInst &Inst,
+ const MCSubtargetInfo &STI) const override;
bool mayNeedRelaxation(const MCInst &Inst,
const MCSubtargetInfo &STI) const override;
@@ -53,12 +54,13 @@ public:
} //End anonymous namespace
-void AMDGPUAsmBackend::relaxInstruction(const MCInst &Inst,
- const MCSubtargetInfo &STI,
- MCInst &Res) const {
+void AMDGPUAsmBackend::relaxInstruction(MCInst &Inst,
+ const MCSubtargetInfo &STI) const {
+ MCInst Res;
unsigned RelaxedOpcode = AMDGPU::getSOPPWithRelaxation(Inst.getOpcode());
Res.setOpcode(RelaxedOpcode);
Res.addOperand(Inst.getOperand(0));
+ Inst = std::move(Res);
return;
}
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
index d352219a7a982..619fde74e88d3 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
@@ -6,8 +6,10 @@
//
//===----------------------------------------------------------------------===//
+#include "AMDGPUFixupKinds.h"
#include "AMDGPUMCTargetDesc.h"
#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixup.h"
@@ -80,6 +82,15 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx,
return ELF::R_AMDGPU_ABS64;
}
+ if (Fixup.getTargetKind() == AMDGPU::fixup_si_sopp_br) {
+ const auto *SymA = Target.getSymA();
+ assert(SymA);
+
+ Ctx.reportError(Fixup.getLoc(),
+ Twine("undefined label '") + SymA->getSymbol().getName() + "'");
+ return ELF::R_AMDGPU_NONE;
+ }
+
llvm_unreachable("unhandled relocation type");
}
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index f65dc25d7eec5..fe063d33ea3e0 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -18,6 +18,7 @@
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
@@ -26,6 +27,28 @@
using namespace llvm;
using namespace llvm::AMDGPU;
+static cl::opt<bool> Keep16BitSuffixes(
+ "amdgpu-keep-16-bit-reg-suffixes",
+ cl::desc("Keep .l and .h suffixes in asm for debugging purposes"),
+ cl::init(false),
+ cl::ReallyHidden);
+
+void AMDGPUInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+ // FIXME: The current implementation of
+ // AsmParser::parseRegisterOrRegisterNumber in MC implies we either emit this
+ // as an integer or we provide a name which represents a physical register.
+ // For CFI instructions we really want to emit a name for the DWARF register
+ // instead, because there may be multiple DWARF registers corresponding to a
+ // single physical register. One case where this problem manifests is with
+ // wave32/wave64 where using the physical register name is ambiguous: if we
+ // write e.g. `.cfi_undefined v0` we lose information about the wavefront
+ // size which we need to encode the register in the final DWARF. Ideally we
+ // would extend MC to support parsing DWARF register names so we could do
+ // something like `.cfi_undefined dwarf_wave32_v0`. For now we just live with
+ // non-pretty DWARF register names in assembly text.
+ OS << RegNo;
+}
+
void AMDGPUInstPrinter::printInst(const MCInst *MI, uint64_t Address,
StringRef Annot, const MCSubtargetInfo &STI,
raw_ostream &OS) {
@@ -164,10 +187,10 @@ void AMDGPUInstPrinter::printSMRDOffset8(const MCInst *MI, unsigned OpNo,
printU32ImmOperand(MI, OpNo, STI, O);
}
-void AMDGPUInstPrinter::printSMRDOffset20(const MCInst *MI, unsigned OpNo,
+void AMDGPUInstPrinter::printSMEMOffset(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
- printU32ImmOperand(MI, OpNo, STI, O);
+ O << formatHex(MI->getOperand(OpNo).getImm());
}
void AMDGPUInstPrinter::printSMRDLiteralOffset(const MCInst *MI, unsigned OpNo,
@@ -244,6 +267,11 @@ void AMDGPUInstPrinter::printR128A16(const MCInst *MI, unsigned OpNo,
printNamedBit(MI, OpNo, O, "r128");
}
+void AMDGPUInstPrinter::printGFX10A16(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O) {
+ printNamedBit(MI, OpNo, O, "a16");
+}
+
void AMDGPUInstPrinter::printLWE(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O) {
printNamedBit(MI, OpNo, O, "lwe");
@@ -287,7 +315,6 @@ void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O,
switch (RegNo) {
case AMDGPU::FP_REG:
case AMDGPU::SP_REG:
- case AMDGPU::SCRATCH_WAVE_OFFSET_REG:
case AMDGPU::PRIVATE_RSRC_REG:
llvm_unreachable("pseudo-register should not ever be emitted");
case AMDGPU::SCC:
@@ -297,7 +324,12 @@ void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O,
}
#endif
- O << getRegisterName(RegNo);
+ StringRef RegName(getRegisterName(RegNo));
+ if (!Keep16BitSuffixes)
+ if (!RegName.consume_back(".l"))
+ RegName.consume_back(".h");
+
+ O << RegName;
}
void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo,
@@ -346,11 +378,21 @@ void AMDGPUInstPrinter::printVINTRPDst(const MCInst *MI, unsigned OpNo,
printOperand(MI, OpNo, STI, O);
}
+void AMDGPUInstPrinter::printImmediateInt16(uint32_t Imm,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ int16_t SImm = static_cast<int16_t>(Imm);
+ if (isInlinableIntLiteral(SImm))
+ O << SImm;
+ else
+ O << formatHex(static_cast<uint64_t>(Imm));
+}
+
void AMDGPUInstPrinter::printImmediate16(uint32_t Imm,
const MCSubtargetInfo &STI,
raw_ostream &O) {
int16_t SImm = static_cast<int16_t>(Imm);
- if (SImm >= -16 && SImm <= 64) {
+ if (isInlinableIntLiteral(SImm)) {
O << SImm;
return;
}
@@ -518,7 +560,8 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
if (Op.isReg()) {
printRegOperand(Op.getReg(), O, MRI);
} else if (Op.isImm()) {
- switch (Desc.OpInfo[OpNo].OperandType) {
+ const uint8_t OpTy = Desc.OpInfo[OpNo].OperandType;
+ switch (OpTy) {
case AMDGPU::OPERAND_REG_IMM_INT32:
case AMDGPU::OPERAND_REG_IMM_FP32:
case AMDGPU::OPERAND_REG_INLINE_C_INT32:
@@ -535,10 +578,12 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
printImmediate64(Op.getImm(), STI, O);
break;
case AMDGPU::OPERAND_REG_INLINE_C_INT16:
- case AMDGPU::OPERAND_REG_INLINE_C_FP16:
case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
- case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
case AMDGPU::OPERAND_REG_IMM_INT16:
+ printImmediateInt16(Op.getImm(), STI, O);
+ break;
+ case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
case AMDGPU::OPERAND_REG_IMM_FP16:
printImmediate16(Op.getImm(), STI, O);
break;
@@ -549,11 +594,19 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
printImmediate32(Op.getImm(), STI, O);
break;
}
+
+ // Deal with 16-bit FP inline immediates not working.
+ if (OpTy == AMDGPU::OPERAND_REG_IMM_V2FP16) {
+ printImmediate16(static_cast<uint16_t>(Op.getImm()), STI, O);
+ break;
+ }
LLVM_FALLTHROUGH;
- case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
- case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
+ printImmediateInt16(static_cast<uint16_t>(Op.getImm()), STI, O);
+ break;
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
printImmediateV216(Op.getImm(), STI, O);
break;
case MCOI::OPERAND_UNKNOWN:
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
index ba53003e90413..6dfd23ea72e67 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
@@ -23,6 +23,7 @@ public:
: MCInstPrinter(MAI, MII, MRI) {}
//Autogenerated by tblgen
+ void printRegName(raw_ostream &OS, unsigned RegNo) const override;
void printInstruction(const MCInst *MI, uint64_t Address,
const MCSubtargetInfo &STI, raw_ostream &O);
static const char *getRegisterName(unsigned RegNo);
@@ -60,7 +61,7 @@ private:
raw_ostream &O);
void printSMRDOffset8(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
- void printSMRDOffset20(const MCInst *MI, unsigned OpNo,
+ void printSMEMOffset(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printSMRDLiteralOffset(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
@@ -86,6 +87,8 @@ private:
raw_ostream &O);
void printR128A16(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
+ void printGFX10A16(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
void printLWE(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printD16(const MCInst *MI, unsigned OpNo,
@@ -102,8 +105,12 @@ private:
raw_ostream &O);
void printVINTRPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
+ void printImmediateInt16(uint32_t Imm, const MCSubtargetInfo &STI,
+ raw_ostream &O);
void printImmediate16(uint32_t Imm, const MCSubtargetInfo &STI,
raw_ostream &O);
+ void printImmediateIntV216(uint32_t Imm, const MCSubtargetInfo &STI,
+ raw_ostream &O);
void printImmediateV216(uint32_t Imm, const MCSubtargetInfo &STI,
raw_ostream &O);
void printImmediate32(uint32_t Imm, const MCSubtargetInfo &STI,
@@ -112,6 +119,10 @@ private:
raw_ostream &O);
void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
+ void printOperand(const MCInst *MI, uint64_t /*Address*/, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O) {
+ printOperand(MI, OpNum, STI, O);
+ }
void printOperandAndFPInputMods(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printOperandAndIntInputMods(const MCInst *MI, unsigned OpNo,
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
index 9644e66fda4e5..687cfef4559f3 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
@@ -43,6 +43,9 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT,
WeakRefDirective = ".weakref\t";
//===--- Dwarf Emission Directives -----------------------------------===//
SupportsDebugInformation = true;
+ DwarfRegNumForCFI = true;
+
+ UseIntegratedAssembler = false;
}
bool AMDGPUMCAsmInfo::shouldOmitSectionDirective(StringRef SectionName) const {
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
index 62757a7078905..d7d8c8181b02f 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
@@ -51,6 +51,12 @@ public:
return 0;
}
+ virtual unsigned getSMEMOffsetEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ return 0;
+ }
+
virtual unsigned getSDWASrcEncoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
index 9507836c64c2b..7d3235efc59e6 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -61,7 +61,13 @@ static MCRegisterInfo *createAMDGPUMCRegisterInfo(const Triple &TT) {
if (TT.getArch() == Triple::r600)
InitR600MCRegisterInfo(X, 0);
else
- InitAMDGPUMCRegisterInfo(X, 0);
+ InitAMDGPUMCRegisterInfo(X, AMDGPU::PC_REG);
+ return X;
+}
+
+MCRegisterInfo *llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour DwarfFlavour) {
+ MCRegisterInfo *X = new MCRegisterInfo();
+ InitAMDGPUMCRegisterInfo(X, AMDGPU::PC_REG, DwarfFlavour);
return X;
}
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
index 9754d31fee600..b9cdbc6502e57 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
@@ -33,6 +33,10 @@ class Target;
class Triple;
class raw_pwrite_stream;
+enum AMDGPUDwarfFlavour { Wave64 = 0, Wave32 = 1 };
+
+MCRegisterInfo *createGCNMCRegisterInfo(AMDGPUDwarfFlavour DwarfFlavour);
+
MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
MCContext &Ctx);
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index fef665c2900ef..3d202d7960d65 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -43,7 +43,7 @@ using namespace llvm::AMDGPU::HSAMD;
bool AMDGPUTargetStreamer::EmitHSAMetadataV2(StringRef HSAMetadataString) {
HSAMD::Metadata HSAMetadata;
- if (HSAMD::fromString(HSAMetadataString, HSAMetadata))
+ if (HSAMD::fromString(std::string(HSAMetadataString), HSAMetadata))
return false;
return EmitHSAMetadata(HSAMetadata);
@@ -97,6 +97,7 @@ StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) {
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010: AK = GK_GFX1010; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011: AK = GK_GFX1011; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012: AK = GK_GFX1012; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1030: AK = GK_GFX1030; break;
case ELF::EF_AMDGPU_MACH_NONE: AK = GK_NONE; break;
}
@@ -148,6 +149,7 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) {
case GK_GFX1010: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010;
case GK_GFX1011: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011;
case GK_GFX1012: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012;
+ case GK_GFX1030: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1030;
case GK_NONE: return ELF::EF_AMDGPU_MACH_NONE;
}
@@ -210,9 +212,9 @@ void AMDGPUTargetAsmStreamer::EmitAMDGPUSymbolType(StringRef SymbolName,
}
void AMDGPUTargetAsmStreamer::emitAMDGPULDS(MCSymbol *Symbol, unsigned Size,
- unsigned Align) {
- OS << "\t.amdgpu_lds " << Symbol->getName() << ", " << Size << ", " << Align
- << '\n';
+ Align Alignment) {
+ OS << "\t.amdgpu_lds " << Symbol->getName() << ", " << Size << ", "
+ << Alignment.value() << '\n';
}
bool AMDGPUTargetAsmStreamer::EmitISAVersion(StringRef IsaVersionString) {
@@ -393,9 +395,9 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
// AMDGPUTargetELFStreamer
//===----------------------------------------------------------------------===//
-AMDGPUTargetELFStreamer::AMDGPUTargetELFStreamer(
- MCStreamer &S, const MCSubtargetInfo &STI)
- : AMDGPUTargetStreamer(S), Streamer(S) {
+AMDGPUTargetELFStreamer::AMDGPUTargetELFStreamer(MCStreamer &S,
+ const MCSubtargetInfo &STI)
+ : AMDGPUTargetStreamer(S), Streamer(S), Os(STI.getTargetTriple().getOS()) {
MCAssembler &MCA = getStreamer().getAssembler();
unsigned EFlags = MCA.getELFHeaderEFlags();
@@ -427,7 +429,7 @@ void AMDGPUTargetELFStreamer::finish() {
if (Blob.empty())
return;
EmitNote(Vendor, MCConstantExpr::create(Blob.size(), getContext()), Type,
- [&](MCELFStreamer &OS) { OS.EmitBytes(Blob); });
+ [&](MCELFStreamer &OS) { OS.emitBytes(Blob); });
}
void AMDGPUTargetELFStreamer::EmitNote(
@@ -438,16 +440,22 @@ void AMDGPUTargetELFStreamer::EmitNote(
auto NameSZ = Name.size() + 1;
+ unsigned NoteFlags = 0;
+ // TODO Apparently, this is currently needed for OpenCL as mentioned in
+ // https://reviews.llvm.org/D74995
+ if (Os == Triple::AMDHSA)
+ NoteFlags = ELF::SHF_ALLOC;
+
S.PushSection();
- S.SwitchSection(Context.getELFSection(
- ElfNote::SectionName, ELF::SHT_NOTE, ELF::SHF_ALLOC));
- S.EmitIntValue(NameSZ, 4); // namesz
- S.EmitValue(DescSZ, 4); // descz
- S.EmitIntValue(NoteType, 4); // type
- S.EmitBytes(Name); // name
- S.EmitValueToAlignment(4, 0, 1, 0); // padding 0
+ S.SwitchSection(
+ Context.getELFSection(ElfNote::SectionName, ELF::SHT_NOTE, NoteFlags));
+ S.emitInt32(NameSZ); // namesz
+ S.emitValue(DescSZ, 4); // descsz
+ S.emitInt32(NoteType); // type
+ S.emitBytes(Name); // name
+ S.emitValueToAlignment(4, 0, 1, 0); // padding 0
EmitDesc(S); // desc
- S.EmitValueToAlignment(4, 0, 1, 0); // padding 0
+ S.emitValueToAlignment(4, 0, 1, 0); // padding 0
S.PopSection();
}
@@ -458,8 +466,8 @@ void AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectVersion(
EmitNote(ElfNote::NoteNameV2, MCConstantExpr::create(8, getContext()),
ElfNote::NT_AMDGPU_HSA_CODE_OBJECT_VERSION, [&](MCELFStreamer &OS) {
- OS.EmitIntValue(Major, 4);
- OS.EmitIntValue(Minor, 4);
+ OS.emitInt32(Major);
+ OS.emitInt32(Minor);
});
}
@@ -478,15 +486,15 @@ AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectISA(uint32_t Major,
EmitNote(ElfNote::NoteNameV2, MCConstantExpr::create(DescSZ, getContext()),
ElfNote::NT_AMDGPU_HSA_ISA, [&](MCELFStreamer &OS) {
- OS.EmitIntValue(VendorNameSize, 2);
- OS.EmitIntValue(ArchNameSize, 2);
- OS.EmitIntValue(Major, 4);
- OS.EmitIntValue(Minor, 4);
- OS.EmitIntValue(Stepping, 4);
- OS.EmitBytes(VendorName);
- OS.EmitIntValue(0, 1); // NULL terminate VendorName
- OS.EmitBytes(ArchName);
- OS.EmitIntValue(0, 1); // NULL terminte ArchName
+ OS.emitInt16(VendorNameSize);
+ OS.emitInt16(ArchNameSize);
+ OS.emitInt32(Major);
+ OS.emitInt32(Minor);
+ OS.emitInt32(Stepping);
+ OS.emitBytes(VendorName);
+ OS.emitInt8(0); // NULL terminate VendorName
+ OS.emitBytes(ArchName);
+ OS.emitInt8(0); // NULL terminate ArchName
});
}
@@ -495,7 +503,7 @@ AMDGPUTargetELFStreamer::EmitAMDKernelCodeT(const amd_kernel_code_t &Header) {
MCStreamer &OS = getStreamer();
OS.PushSection();
- OS.EmitBytes(StringRef((const char*)&Header, sizeof(Header)));
+ OS.emitBytes(StringRef((const char*)&Header, sizeof(Header)));
OS.PopSection();
}
@@ -507,9 +515,7 @@ void AMDGPUTargetELFStreamer::EmitAMDGPUSymbolType(StringRef SymbolName,
}
void AMDGPUTargetELFStreamer::emitAMDGPULDS(MCSymbol *Symbol, unsigned Size,
- unsigned Align) {
- assert(isPowerOf2_32(Align));
-
+ Align Alignment) {
MCSymbolELF *SymbolELF = cast<MCSymbolELF>(Symbol);
SymbolELF->setType(ELF::STT_OBJECT);
@@ -518,7 +524,7 @@ void AMDGPUTargetELFStreamer::emitAMDGPULDS(MCSymbol *Symbol, unsigned Size,
SymbolELF->setExternal(true);
}
- if (SymbolELF->declareCommon(Size, Align, true)) {
+ if (SymbolELF->declareCommon(Size, Alignment.value(), true)) {
report_fatal_error("Symbol: " + Symbol->getName() +
" redeclared as different type");
}
@@ -539,9 +545,9 @@ bool AMDGPUTargetELFStreamer::EmitISAVersion(StringRef IsaVersionString) {
EmitNote(ElfNote::NoteNameV2, DescSZ, ELF::NT_AMD_AMDGPU_ISA,
[&](MCELFStreamer &OS) {
- OS.EmitLabel(DescBegin);
- OS.EmitBytes(IsaVersionString);
- OS.EmitLabel(DescEnd);
+ OS.emitLabel(DescBegin);
+ OS.emitBytes(IsaVersionString);
+ OS.emitLabel(DescEnd);
});
return true;
}
@@ -566,9 +572,9 @@ bool AMDGPUTargetELFStreamer::EmitHSAMetadata(msgpack::Document &HSAMetadataDoc,
EmitNote(ElfNote::NoteNameV3, DescSZ, ELF::NT_AMDGPU_METADATA,
[&](MCELFStreamer &OS) {
- OS.EmitLabel(DescBegin);
- OS.EmitBytes(HSAMetadataString);
- OS.EmitLabel(DescEnd);
+ OS.emitLabel(DescBegin);
+ OS.emitBytes(HSAMetadataString);
+ OS.emitLabel(DescEnd);
});
return true;
}
@@ -590,9 +596,9 @@ bool AMDGPUTargetELFStreamer::EmitHSAMetadata(
EmitNote(ElfNote::NoteNameV2, DescSZ, ELF::NT_AMD_AMDGPU_HSA_METADATA,
[&](MCELFStreamer &OS) {
- OS.EmitLabel(DescBegin);
- OS.EmitBytes(HSAMetadataString);
- OS.EmitLabel(DescEnd);
+ OS.emitLabel(DescBegin);
+ OS.emitBytes(HSAMetadataString);
+ OS.emitLabel(DescEnd);
});
return true;
}
@@ -602,9 +608,9 @@ bool AMDGPUTargetELFStreamer::EmitCodeEnd() {
MCStreamer &OS = getStreamer();
OS.PushSection();
- OS.EmitValueToAlignment(64, Encoded_s_code_end, 4);
+ OS.emitValueToAlignment(64, Encoded_s_code_end, 4);
for (unsigned I = 0; I < 48; ++I)
- OS.EmitIntValue(Encoded_s_code_end, 4);
+ OS.emitInt32(Encoded_s_code_end);
OS.PopSection();
return true;
}
@@ -637,22 +643,22 @@ void AMDGPUTargetELFStreamer::EmitAmdhsaKernelDescriptor(
if (KernelCodeSymbol->getVisibility() == ELF::STV_DEFAULT)
KernelCodeSymbol->setVisibility(ELF::STV_PROTECTED);
- Streamer.EmitLabel(KernelDescriptorSymbol);
- Streamer.EmitBytes(StringRef(
+ Streamer.emitLabel(KernelDescriptorSymbol);
+ Streamer.emitBytes(StringRef(
(const char*)&(KernelDescriptor),
offsetof(amdhsa::kernel_descriptor_t, kernel_code_entry_byte_offset)));
// FIXME: Remove the use of VK_AMDGPU_REL64 in the expression below. The
// expression being created is:
// (start of kernel code) - (start of kernel descriptor)
// It implies R_AMDGPU_REL64, but ends up being R_AMDGPU_ABS64.
- Streamer.EmitValue(MCBinaryExpr::createSub(
+ Streamer.emitValue(MCBinaryExpr::createSub(
MCSymbolRefExpr::create(
KernelCodeSymbol, MCSymbolRefExpr::VK_AMDGPU_REL64, Context),
MCSymbolRefExpr::create(
KernelDescriptorSymbol, MCSymbolRefExpr::VK_None, Context),
Context),
sizeof(KernelDescriptor.kernel_code_entry_byte_offset));
- Streamer.EmitBytes(StringRef(
+ Streamer.emitBytes(StringRef(
(const char*)&(KernelDescriptor) +
offsetof(amdhsa::kernel_descriptor_t, kernel_code_entry_byte_offset) +
sizeof(KernelDescriptor.kernel_code_entry_byte_offset),
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index 683b3e363b9aa..a19d4646deb26 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -54,7 +54,7 @@ public:
virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) = 0;
virtual void emitAMDGPULDS(MCSymbol *Symbol, unsigned Size,
- unsigned Align) = 0;
+ Align Alignment) = 0;
/// \returns True on success, false on failure.
virtual bool EmitISAVersion(StringRef IsaVersionString) = 0;
@@ -110,7 +110,7 @@ public:
void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override;
- void emitAMDGPULDS(MCSymbol *Sym, unsigned Size, unsigned Align) override;
+ void emitAMDGPULDS(MCSymbol *Sym, unsigned Size, Align Alignment) override;
/// \returns True on success, false on failure.
bool EmitISAVersion(StringRef IsaVersionString) override;
@@ -133,6 +133,7 @@ public:
class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer {
MCStreamer &Streamer;
+ Triple::OSType Os;
void EmitNote(StringRef Name, const MCExpr *DescSize, unsigned NoteType,
function_ref<void(MCELFStreamer &)> EmitDesc);
@@ -157,7 +158,7 @@ public:
void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override;
- void emitAMDGPULDS(MCSymbol *Sym, unsigned Size, unsigned Align) override;
+ void emitAMDGPULDS(MCSymbol *Sym, unsigned Size, Align Alignment) override;
/// \returns True on success, false on failure.
bool EmitISAVersion(StringRef IsaVersionString) override;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
index 2f1f4e7a03928..f614705730501 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -47,7 +47,7 @@ public:
/// Encode the instruction and write it to the OS.
void encodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
+ const MCSubtargetInfo &STI) const override;
/// \returns the encoding for an MCOperand.
uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
index f8ec3c36f0190..2cd6c3a81d2bf 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -13,7 +13,6 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "AMDGPURegisterInfo.h"
#include "MCTargetDesc/AMDGPUFixupKinds.h"
#include "MCTargetDesc/AMDGPUMCCodeEmitter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
@@ -71,6 +70,10 @@ public:
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const override;
+ unsigned getSMEMOffsetEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const override;
+
unsigned getSDWASrcEncoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const override;
@@ -105,6 +108,11 @@ static uint32_t getIntInlineImmEncoding(IntTy Imm) {
return 0;
}
+static uint32_t getLit16IntEncoding(uint16_t Val, const MCSubtargetInfo &STI) {
+ uint16_t IntImm = getIntInlineImmEncoding(static_cast<int16_t>(Val));
+ return IntImm == 0 ? 255 : IntImm;
+}
+
static uint32_t getLit16Encoding(uint16_t Val, const MCSubtargetInfo &STI) {
uint16_t IntImm = getIntInlineImmEncoding(static_cast<int16_t>(Val));
if (IntImm != 0)
@@ -249,23 +257,27 @@ uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO,
return getLit64Encoding(static_cast<uint64_t>(Imm), STI);
case AMDGPU::OPERAND_REG_IMM_INT16:
- case AMDGPU::OPERAND_REG_IMM_FP16:
case AMDGPU::OPERAND_REG_INLINE_C_INT16:
- case AMDGPU::OPERAND_REG_INLINE_C_FP16:
case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
+ return getLit16IntEncoding(static_cast<uint16_t>(Imm), STI);
+ case AMDGPU::OPERAND_REG_IMM_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP16:
case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
// FIXME Is this correct? What do inline immediates do on SI for f16 src
// which does not have f16 support?
return getLit16Encoding(static_cast<uint16_t>(Imm), STI);
-
case AMDGPU::OPERAND_REG_IMM_V2INT16:
- case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ case AMDGPU::OPERAND_REG_IMM_V2FP16: {
if (!isUInt<16>(Imm) && STI.getFeatureBits()[AMDGPU::FeatureVOP3Literal])
return getLit32Encoding(static_cast<uint32_t>(Imm), STI);
+ if (OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP16)
+ return getLit16Encoding(static_cast<uint16_t>(Imm), STI);
LLVM_FALLTHROUGH;
+ }
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
- case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
+ return getLit16IntEncoding(static_cast<uint16_t>(Imm), STI);
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: {
uint16_t Lo16 = static_cast<uint16_t>(Imm);
uint32_t Encoding = getLit16Encoding(Lo16, STI);
@@ -359,6 +371,15 @@ unsigned SIMCCodeEmitter::getSOPPBrEncoding(const MCInst &MI, unsigned OpNo,
return getMachineOpValue(MI, MO, Fixups, STI);
}
+unsigned SIMCCodeEmitter::getSMEMOffsetEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ auto Offset = MI.getOperand(OpNo).getImm();
+ // VI only supports 20-bit unsigned offsets.
+ assert(!AMDGPU::isVI(STI) || isUInt<20>(Offset));
+ return Offset;
+}
+
unsigned
SIMCCodeEmitter::getSDWASrcEncoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
@@ -419,7 +440,13 @@ SIMCCodeEmitter::getAVOperandEncoding(const MCInst &MI, unsigned OpNo,
// instructions use acc[0:1] modifier bits to distinguish. These bits are
// encoded as a virtual 9th bit of the register for these operands.
if (MRI.getRegClass(AMDGPU::AGPR_32RegClassID).contains(Reg) ||
- MRI.getRegClass(AMDGPU::AReg_64RegClassID).contains(Reg))
+ MRI.getRegClass(AMDGPU::AReg_64RegClassID).contains(Reg) ||
+ MRI.getRegClass(AMDGPU::AReg_96RegClassID).contains(Reg) ||
+ MRI.getRegClass(AMDGPU::AReg_128RegClassID).contains(Reg) ||
+ MRI.getRegClass(AMDGPU::AReg_160RegClassID).contains(Reg) ||
+ MRI.getRegClass(AMDGPU::AReg_192RegClassID).contains(Reg) ||
+ MRI.getRegClass(AMDGPU::AReg_256RegClassID).contains(Reg) ||
+ MRI.getRegClass(AMDGPU::AGPR_LO16RegClassID).contains(Reg))
Enc |= 512;
return Enc;
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index 4006a6205fb87..2bfc2d5795333 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -1,4 +1,4 @@
-//===-- MIMGInstructions.td - MIMG Instruction Defintions -----------------===//
+//===-- MIMGInstructions.td - MIMG Instruction Definitions ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -35,6 +35,7 @@ class MIMGBaseOpcode : PredicateControl {
bit Gather4 = 0;
bits<8> NumExtraArgs = 0;
bit Gradients = 0;
+ bit G16 = 0;
bit Coordinates = 1;
bit LodOrClampOrMip = 0;
bit HasD16 = 0;
@@ -47,9 +48,9 @@ def MIMGBaseOpcode : GenericEnum {
def MIMGBaseOpcodesTable : GenericTable {
let FilterClass = "MIMGBaseOpcode";
let CppTypeName = "MIMGBaseOpcodeInfo";
- let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler", "Gather4",
- "NumExtraArgs", "Gradients", "Coordinates", "LodOrClampOrMip",
- "HasD16"];
+ let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler",
+ "Gather4", "NumExtraArgs", "Gradients", "G16", "Coordinates",
+ "LodOrClampOrMip", "HasD16"];
GenericEnum TypeOf_BaseOpcode = MIMGBaseOpcode;
let PrimaryKey = ["BaseOpcode"];
@@ -117,6 +118,22 @@ def MIMGMIPMappingTable : GenericTable {
let PrimaryKeyName = "getMIMGMIPMappingInfo";
}
+class MIMGG16Mapping<MIMGBaseOpcode g, MIMGBaseOpcode g16> {
+ MIMGBaseOpcode G = g;
+ MIMGBaseOpcode G16 = g16;
+}
+
+def MIMGG16MappingTable : GenericTable {
+ let FilterClass = "MIMGG16Mapping";
+ let CppTypeName = "MIMGG16MappingInfo";
+ let Fields = ["G", "G16"];
+ GenericEnum TypeOf_G = MIMGBaseOpcode;
+ GenericEnum TypeOf_G16 = MIMGBaseOpcode;
+
+ let PrimaryKey = ["G"];
+ let PrimaryKeyName = "getMIMGG16MappingInfo";
+}
+
class MIMG_Base <dag outs, string dns = "">
: InstSI <outs, (ins), "", []> {
@@ -132,7 +149,6 @@ class MIMG_Base <dag outs, string dns = "">
let DecoderNamespace = dns;
let isAsmParserOnly = !if(!eq(dns,""), 1, 0);
- let usesCustomInserter = 1;
}
class MIMG <dag outs, string dns = "">
@@ -238,9 +254,9 @@ class MIMG_NoSampler_gfx10<int op, string opcode,
: MIMG_gfx10<op, (outs DataRC:$vdata), dns> {
let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256:$srsrc, DMask:$dmask,
Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc,
- SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe),
+ SLC:$slc, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe),
!if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
- let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$tfe$lwe"
+ let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$a16$tfe$lwe"
#!if(BaseOpcode.HasD16, "$d16", "");
}
@@ -251,9 +267,9 @@ class MIMG_NoSampler_nsa_gfx10<int op, string opcode,
let InOperandList = !con(AddrIns,
(ins SReg_256:$srsrc, DMask:$dmask,
Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc,
- SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe),
+ SLC:$slc, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe),
!if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
- let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$tfe$lwe"
+ let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$a16$tfe$lwe"
#!if(BaseOpcode.HasD16, "$d16", "");
}
@@ -331,9 +347,9 @@ class MIMG_Store_gfx10<int op, string opcode,
: MIMG_gfx10<op, (outs), dns> {
let InOperandList = !con((ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc,
DMask:$dmask, Dim:$dim, UNorm:$unorm, DLC:$dlc,
- GLC:$glc, SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe),
+ GLC:$glc, SLC:$slc, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe),
!if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
- let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$tfe$lwe"
+ let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$a16$tfe$lwe"
#!if(BaseOpcode.HasD16, "$d16", "");
}
@@ -345,9 +361,9 @@ class MIMG_Store_nsa_gfx10<int op, string opcode,
AddrIns,
(ins SReg_256:$srsrc, DMask:$dmask,
Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc,
- SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe),
+ SLC:$slc, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe),
!if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
- let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$tfe$lwe"
+ let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$a16$tfe$lwe"
#!if(BaseOpcode.HasD16, "$d16", "");
}
@@ -436,8 +452,8 @@ class MIMG_Atomic_gfx10<mimg op, string opcode,
let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc,
DMask:$dmask, Dim:$dim, UNorm:$unorm, DLC:$dlc,
- GLC:$glc, SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe);
- let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$tfe$lwe";
+ GLC:$glc, SLC:$slc, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe);
+ let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$a16$tfe$lwe";
}
class MIMG_Atomic_nsa_gfx10<mimg op, string opcode,
@@ -452,8 +468,8 @@ class MIMG_Atomic_nsa_gfx10<mimg op, string opcode,
AddrIns,
(ins SReg_256:$srsrc, DMask:$dmask,
Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc,
- SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe));
- let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$tfe$lwe";
+ SLC:$slc, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe));
+ let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$a16$tfe$lwe";
}
multiclass MIMG_Atomic_Addr_Helper_m <mimg op, string asm,
@@ -522,10 +538,10 @@ class MIMG_Sampler_gfx10<int op, string opcode,
: MIMG_gfx10<op, (outs DataRC:$vdata), dns> {
let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256:$srsrc, SReg_128:$ssamp,
DMask:$dmask, Dim:$dim, UNorm:$unorm, DLC:$dlc,
- GLC:$glc, SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe),
+ GLC:$glc, SLC:$slc, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe),
!if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
let AsmString = opcode#" $vdata, $vaddr0, $srsrc, $ssamp$dmask$dim$unorm"
- #"$dlc$glc$slc$r128$tfe$lwe"
+ #"$dlc$glc$slc$r128$a16$tfe$lwe"
#!if(BaseOpcode.HasD16, "$d16", "");
}
@@ -536,10 +552,10 @@ class MIMG_Sampler_nsa_gfx10<int op, string opcode,
let InOperandList = !con(AddrIns,
(ins SReg_256:$srsrc, SReg_128:$ssamp, DMask:$dmask,
Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc,
- SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe),
+ SLC:$slc, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe),
!if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc, $ssamp$dmask$dim$unorm"
- #"$dlc$glc$slc$r128$tfe$lwe"
+ #"$dlc$glc$slc$r128$a16$tfe$lwe"
#!if(BaseOpcode.HasD16, "$d16", "");
}
@@ -646,10 +662,11 @@ class MIMG_Sampler_BaseOpcode<AMDGPUSampleVariant sample>
}
multiclass MIMG_Sampler <bits<8> op, AMDGPUSampleVariant sample, bit wqm = 0,
- bit isGetLod = 0,
- string asm = "image_sample"#sample.LowerCaseMod> {
+ bit isG16 = 0, bit isGetLod = 0,
+ string asm = "image_sample"#sample.LowerCaseMod#!if(isG16, "_g16", "")> {
def "" : MIMG_Sampler_BaseOpcode<sample> {
let HasD16 = !if(isGetLod, 0, 1);
+ let G16 = isG16;
}
let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm,
@@ -726,76 +743,95 @@ defm IMAGE_ATOMIC_DEC : MIMG_Atomic <mimg<0x1c>, "image_atomic_dec">;
//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 0x0000001e>; -- not on VI
//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>; -- not on VI
//} // End let FPAtomic = 1
-defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, AMDGPUSample>;
-defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, AMDGPUSample_cl>;
-defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, AMDGPUSample_d>;
-defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <0x00000023, AMDGPUSample_d_cl>;
-defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, AMDGPUSample_l>;
-defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <0x00000025, AMDGPUSample_b>;
-defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <0x00000026, AMDGPUSample_b_cl>;
-defm IMAGE_SAMPLE_LZ : MIMG_Sampler <0x00000027, AMDGPUSample_lz>;
-defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <0x00000028, AMDGPUSample_c>;
-defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <0x00000029, AMDGPUSample_c_cl>;
-defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, AMDGPUSample_c_d>;
-defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <0x0000002b, AMDGPUSample_c_d_cl>;
-defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, AMDGPUSample_c_l>;
-defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <0x0000002d, AMDGPUSample_c_b>;
-defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <0x0000002e, AMDGPUSample_c_b_cl>;
-defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <0x0000002f, AMDGPUSample_c_lz>;
-defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <0x00000030, AMDGPUSample_o>;
-defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <0x00000031, AMDGPUSample_cl_o>;
-defm IMAGE_SAMPLE_D_O : MIMG_Sampler <0x00000032, AMDGPUSample_d_o>;
-defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <0x00000033, AMDGPUSample_d_cl_o>;
-defm IMAGE_SAMPLE_L_O : MIMG_Sampler <0x00000034, AMDGPUSample_l_o>;
-defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <0x00000035, AMDGPUSample_b_o>;
-defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <0x00000036, AMDGPUSample_b_cl_o>;
-defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <0x00000037, AMDGPUSample_lz_o>;
-defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <0x00000038, AMDGPUSample_c_o>;
-defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <0x00000039, AMDGPUSample_c_cl_o>;
-defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <0x0000003a, AMDGPUSample_c_d_o>;
-defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <0x0000003b, AMDGPUSample_c_d_cl_o>;
-defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <0x0000003c, AMDGPUSample_c_l_o>;
-defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <0x0000003e, AMDGPUSample_c_b_cl_o>;
-defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <0x0000003d, AMDGPUSample_c_b_o>;
-defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <0x0000003f, AMDGPUSample_c_lz_o>;
-defm IMAGE_GATHER4 : MIMG_Gather_WQM <0x00000040, AMDGPUSample>;
-defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <0x00000041, AMDGPUSample_cl>;
-defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, AMDGPUSample_l>;
-defm IMAGE_GATHER4_B : MIMG_Gather_WQM <0x00000045, AMDGPUSample_b>;
-defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <0x00000046, AMDGPUSample_b_cl>;
-defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, AMDGPUSample_lz>;
-defm IMAGE_GATHER4_C : MIMG_Gather_WQM <0x00000048, AMDGPUSample_c>;
-defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <0x00000049, AMDGPUSample_c_cl>;
-defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, AMDGPUSample_c_l>;
-defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <0x0000004d, AMDGPUSample_c_b>;
-defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <0x0000004e, AMDGPUSample_c_b_cl>;
-defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, AMDGPUSample_c_lz>;
-defm IMAGE_GATHER4_O : MIMG_Gather_WQM <0x00000050, AMDGPUSample_o>;
-defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <0x00000051, AMDGPUSample_cl_o>;
-defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, AMDGPUSample_l_o>;
-defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <0x00000055, AMDGPUSample_b_o>;
-defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, AMDGPUSample_b_cl_o>;
-defm IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, AMDGPUSample_lz_o>;
-defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <0x00000058, AMDGPUSample_c_o>;
-defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <0x00000059, AMDGPUSample_c_cl_o>;
-defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, AMDGPUSample_c_l_o>;
-defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <0x0000005d, AMDGPUSample_c_b_o>;
-defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <0x0000005e, AMDGPUSample_c_b_cl_o>;
-defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, AMDGPUSample_c_lz_o>;
-
-defm IMAGE_GET_LOD : MIMG_Sampler <0x00000060, AMDGPUSample, 1, 1, "image_get_lod">;
-
-defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, AMDGPUSample_cd>;
-defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, AMDGPUSample_cd_cl>;
-defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, AMDGPUSample_c_cd>;
-defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <0x0000006b, AMDGPUSample_c_cd_cl>;
-defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <0x0000006c, AMDGPUSample_cd_o>;
-defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <0x0000006d, AMDGPUSample_cd_cl_o>;
-defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <0x0000006e, AMDGPUSample_c_cd_o>;
-defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, AMDGPUSample_c_cd_cl_o>;
+defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, AMDGPUSample>;
+defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, AMDGPUSample_cl>;
+defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, AMDGPUSample_d>;
+defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <0x00000023, AMDGPUSample_d_cl>;
+defm IMAGE_SAMPLE_D_G16 : MIMG_Sampler <0x000000a2, AMDGPUSample_d, 0, 1>;
+defm IMAGE_SAMPLE_D_CL_G16 : MIMG_Sampler <0x000000a3, AMDGPUSample_d_cl, 0, 1>;
+defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, AMDGPUSample_l>;
+defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <0x00000025, AMDGPUSample_b>;
+defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <0x00000026, AMDGPUSample_b_cl>;
+defm IMAGE_SAMPLE_LZ : MIMG_Sampler <0x00000027, AMDGPUSample_lz>;
+defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <0x00000028, AMDGPUSample_c>;
+defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <0x00000029, AMDGPUSample_c_cl>;
+defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, AMDGPUSample_c_d>;
+defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <0x0000002b, AMDGPUSample_c_d_cl>;
+defm IMAGE_SAMPLE_C_D_G16 : MIMG_Sampler <0x000000aa, AMDGPUSample_c_d, 0, 1>;
+defm IMAGE_SAMPLE_C_D_CL_G16 : MIMG_Sampler <0x000000ab, AMDGPUSample_c_d_cl, 0, 1>;
+defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, AMDGPUSample_c_l>;
+defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <0x0000002d, AMDGPUSample_c_b>;
+defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <0x0000002e, AMDGPUSample_c_b_cl>;
+defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <0x0000002f, AMDGPUSample_c_lz>;
+defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <0x00000030, AMDGPUSample_o>;
+defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <0x00000031, AMDGPUSample_cl_o>;
+defm IMAGE_SAMPLE_D_O : MIMG_Sampler <0x00000032, AMDGPUSample_d_o>;
+defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <0x00000033, AMDGPUSample_d_cl_o>;
+defm IMAGE_SAMPLE_D_O_G16 : MIMG_Sampler <0x000000b2, AMDGPUSample_d_o, 0, 1>;
+defm IMAGE_SAMPLE_D_CL_O_G16 : MIMG_Sampler <0x000000b3, AMDGPUSample_d_cl_o, 0, 1>;
+defm IMAGE_SAMPLE_L_O : MIMG_Sampler <0x00000034, AMDGPUSample_l_o>;
+defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <0x00000035, AMDGPUSample_b_o>;
+defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <0x00000036, AMDGPUSample_b_cl_o>;
+defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <0x00000037, AMDGPUSample_lz_o>;
+defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <0x00000038, AMDGPUSample_c_o>;
+defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <0x00000039, AMDGPUSample_c_cl_o>;
+defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <0x0000003a, AMDGPUSample_c_d_o>;
+defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <0x0000003b, AMDGPUSample_c_d_cl_o>;
+defm IMAGE_SAMPLE_C_D_O_G16 : MIMG_Sampler <0x000000ba, AMDGPUSample_c_d_o, 0, 1>;
+defm IMAGE_SAMPLE_C_D_CL_O_G16 : MIMG_Sampler <0x000000bb, AMDGPUSample_c_d_cl_o, 0, 1>;
+defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <0x0000003c, AMDGPUSample_c_l_o>;
+defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <0x0000003e, AMDGPUSample_c_b_cl_o>;
+defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <0x0000003d, AMDGPUSample_c_b_o>;
+defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <0x0000003f, AMDGPUSample_c_lz_o>;
+defm IMAGE_GATHER4 : MIMG_Gather_WQM <0x00000040, AMDGPUSample>;
+defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <0x00000041, AMDGPUSample_cl>;
+defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, AMDGPUSample_l>;
+defm IMAGE_GATHER4_B : MIMG_Gather_WQM <0x00000045, AMDGPUSample_b>;
+defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <0x00000046, AMDGPUSample_b_cl>;
+defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, AMDGPUSample_lz>;
+defm IMAGE_GATHER4_C : MIMG_Gather_WQM <0x00000048, AMDGPUSample_c>;
+defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <0x00000049, AMDGPUSample_c_cl>;
+defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, AMDGPUSample_c_l>;
+defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <0x0000004d, AMDGPUSample_c_b>;
+defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <0x0000004e, AMDGPUSample_c_b_cl>;
+defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, AMDGPUSample_c_lz>;
+defm IMAGE_GATHER4_O : MIMG_Gather_WQM <0x00000050, AMDGPUSample_o>;
+defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <0x00000051, AMDGPUSample_cl_o>;
+defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, AMDGPUSample_l_o>;
+defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <0x00000055, AMDGPUSample_b_o>;
+defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, AMDGPUSample_b_cl_o>;
+defm IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, AMDGPUSample_lz_o>;
+defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <0x00000058, AMDGPUSample_c_o>;
+defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <0x00000059, AMDGPUSample_c_cl_o>;
+defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, AMDGPUSample_c_l_o>;
+defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <0x0000005d, AMDGPUSample_c_b_o>;
+defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <0x0000005e, AMDGPUSample_c_b_cl_o>;
+defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, AMDGPUSample_c_lz_o>;
+
+defm IMAGE_GET_LOD : MIMG_Sampler <0x00000060, AMDGPUSample, 1, 0, 1, "image_get_lod">;
+
+defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, AMDGPUSample_cd>;
+defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, AMDGPUSample_cd_cl>;
+defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, AMDGPUSample_c_cd>;
+defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <0x0000006b, AMDGPUSample_c_cd_cl>;
+defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <0x0000006c, AMDGPUSample_cd_o>;
+defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <0x0000006d, AMDGPUSample_cd_cl_o>;
+defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <0x0000006e, AMDGPUSample_c_cd_o>;
+defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, AMDGPUSample_c_cd_cl_o>;
+defm IMAGE_SAMPLE_CD_G16 : MIMG_Sampler <0x000000e8, AMDGPUSample_cd, 0, 1>;
+defm IMAGE_SAMPLE_CD_CL_G16 : MIMG_Sampler <0x000000e9, AMDGPUSample_cd_cl, 0, 1>;
+defm IMAGE_SAMPLE_C_CD_G16 : MIMG_Sampler <0x000000ea, AMDGPUSample_c_cd, 0, 1>;
+defm IMAGE_SAMPLE_C_CD_CL_G16 : MIMG_Sampler <0x000000eb, AMDGPUSample_c_cd_cl, 0, 1>;
+defm IMAGE_SAMPLE_CD_O_G16 : MIMG_Sampler <0x000000ec, AMDGPUSample_cd_o, 0, 1>;
+defm IMAGE_SAMPLE_CD_CL_O_G16 : MIMG_Sampler <0x000000ed, AMDGPUSample_cd_cl_o, 0, 1>;
+defm IMAGE_SAMPLE_C_CD_O_G16 : MIMG_Sampler <0x000000ee, AMDGPUSample_c_cd_o, 0, 1>;
+defm IMAGE_SAMPLE_C_CD_CL_O_G16 : MIMG_Sampler <0x000000ef, AMDGPUSample_c_cd_cl_o, 0, 1>;
//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", 0x0000007e>;
//def IMAGE_SAMPLER : MIMG_NoPattern_ <"image_sampler", 0x0000007f>;
+let SubtargetPredicate = HasGFX10_BEncoding in
+defm IMAGE_MSAA_LOAD : MIMG_NoSampler <0x00000080, "image_msaa_load", 1>;
+
/********** ========================================= **********/
/********** Table of dimension-aware image intrinsics **********/
/********** ========================================= **********/
@@ -817,6 +853,11 @@ def ImageDimIntrinsicTable : GenericTable {
let PrimaryKeyEarlyOut = 1;
}
+def getImageDimInstrinsicByBaseOpcode : SearchIndex {
+ let Table = ImageDimIntrinsicTable;
+ let Key = ["BaseOpcode", "Dim"];
+}
+
foreach intr = !listconcat(AMDGPUImageDimIntrinsics,
AMDGPUImageDimAtomicIntrinsics) in {
def : ImageDimIntrinsicInfo<intr>;
@@ -835,3 +876,21 @@ def : MIMGLZMapping<IMAGE_GATHER4_C_L_O, IMAGE_GATHER4_C_LZ_O>;
// MIP to NONMIP Optimization Mapping
def : MIMGMIPMapping<IMAGE_LOAD_MIP, IMAGE_LOAD>;
def : MIMGMIPMapping<IMAGE_STORE_MIP, IMAGE_STORE>;
+
+// G to G16 Optimization Mapping
+def : MIMGG16Mapping<IMAGE_SAMPLE_D, IMAGE_SAMPLE_D_G16>;
+def : MIMGG16Mapping<IMAGE_SAMPLE_D_CL, IMAGE_SAMPLE_D_CL_G16>;
+def : MIMGG16Mapping<IMAGE_SAMPLE_C_D, IMAGE_SAMPLE_C_D_G16>;
+def : MIMGG16Mapping<IMAGE_SAMPLE_C_D_CL, IMAGE_SAMPLE_C_D_CL_G16>;
+def : MIMGG16Mapping<IMAGE_SAMPLE_D_O, IMAGE_SAMPLE_D_O_G16>;
+def : MIMGG16Mapping<IMAGE_SAMPLE_D_CL_O, IMAGE_SAMPLE_D_CL_O_G16>;
+def : MIMGG16Mapping<IMAGE_SAMPLE_C_D_O, IMAGE_SAMPLE_C_D_O_G16>;
+def : MIMGG16Mapping<IMAGE_SAMPLE_C_D_CL_O, IMAGE_SAMPLE_C_D_CL_O_G16>;
+def : MIMGG16Mapping<IMAGE_SAMPLE_CD, IMAGE_SAMPLE_CD_G16>;
+def : MIMGG16Mapping<IMAGE_SAMPLE_CD_CL, IMAGE_SAMPLE_CD_CL_G16>;
+def : MIMGG16Mapping<IMAGE_SAMPLE_C_CD, IMAGE_SAMPLE_C_CD_G16>;
+def : MIMGG16Mapping<IMAGE_SAMPLE_C_CD_CL, IMAGE_SAMPLE_C_CD_CL_G16>;
+def : MIMGG16Mapping<IMAGE_SAMPLE_CD_O, IMAGE_SAMPLE_CD_O_G16>;
+def : MIMGG16Mapping<IMAGE_SAMPLE_CD_CL_O, IMAGE_SAMPLE_CD_CL_O_G16>;
+def : MIMGG16Mapping<IMAGE_SAMPLE_C_CD_O, IMAGE_SAMPLE_C_CD_O_G16>;
+def : MIMGG16Mapping<IMAGE_SAMPLE_C_CD_CL_O, IMAGE_SAMPLE_C_CD_CL_O_G16>;
diff --git a/llvm/lib/Target/AMDGPU/R600AsmPrinter.cpp b/llvm/lib/Target/AMDGPU/R600AsmPrinter.cpp
index ed23c8ea814b4..d363baa15507a 100644
--- a/llvm/lib/Target/AMDGPU/R600AsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/R600AsmPrinter.cpp
@@ -88,15 +88,15 @@ void R600AsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
}
}
- OutStreamer->EmitIntValue(RsrcReg, 4);
- OutStreamer->EmitIntValue(S_NUM_GPRS(MaxGPR + 1) |
+ OutStreamer->emitInt32(RsrcReg);
+ OutStreamer->emitIntValue(S_NUM_GPRS(MaxGPR + 1) |
S_STACK_SIZE(MFI->CFStackSize), 4);
- OutStreamer->EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4);
- OutStreamer->EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4);
+ OutStreamer->emitInt32(R_02880C_DB_SHADER_CONTROL);
+ OutStreamer->emitInt32(S_02880C_KILL_ENABLE(killPixel));
if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
- OutStreamer->EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4);
- OutStreamer->EmitIntValue(alignTo(MFI->getLDSSize(), 4) >> 2, 4);
+ OutStreamer->emitInt32(R_0288E8_SQ_LDS_ALLOC);
+ OutStreamer->emitIntValue(alignTo(MFI->getLDSSize(), 4) >> 2, 4);
}
}
@@ -115,7 +115,7 @@ bool R600AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
EmitProgramInfoR600(MF);
- EmitFunctionBody();
+ emitFunctionBody();
if (isVerbose()) {
MCSectionELF *CommentSection =
diff --git a/llvm/lib/Target/AMDGPU/R600AsmPrinter.h b/llvm/lib/Target/AMDGPU/R600AsmPrinter.h
index 0da9526d716ea..552d01f81b66c 100644
--- a/llvm/lib/Target/AMDGPU/R600AsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/R600AsmPrinter.h
@@ -26,7 +26,7 @@ public:
StringRef getPassName() const override;
bool runOnMachineFunction(MachineFunction &MF) override;
/// Implemented in AMDGPUMCInstLower.cpp
- void EmitInstruction(const MachineInstr *MI) override;
+ void emitInstruction(const MachineInstr *MI) override;
/// Lower the specified LLVM Constant to an MCExpr.
/// The AsmPrinter::lowerConstant does not know how to lower addrspacecast,
/// therefore such casts should be lowered by this function.
diff --git a/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
index e4160ac11c863..8124df68f6886 100644
--- a/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
@@ -159,8 +159,7 @@ unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {
}
void CFStack::updateMaxStackSize() {
- unsigned CurrentStackSize =
- CurrentEntries + (alignTo(CurrentSubEntries, 4) / 4);
+ unsigned CurrentStackSize = CurrentEntries + divideCeil(CurrentSubEntries, 4);
MaxStackSize = std::max(CurrentStackSize, MaxStackSize);
}
@@ -308,7 +307,7 @@ private:
DstMI = Reg;
else
DstMI = TRI->getMatchingSuperReg(Reg,
- AMDGPURegisterInfo::getSubRegFromChannel(TRI->getHWRegChan(Reg)),
+ R600RegisterInfo::getSubRegFromChannel(TRI->getHWRegChan(Reg)),
&R600::R600_Reg128RegClass);
}
if (MO.isUse()) {
@@ -317,7 +316,7 @@ private:
SrcMI = Reg;
else
SrcMI = TRI->getMatchingSuperReg(Reg,
- AMDGPURegisterInfo::getSubRegFromChannel(TRI->getHWRegChan(Reg)),
+ R600RegisterInfo::getSubRegFromChannel(TRI->getHWRegChan(Reg)),
&R600::R600_Reg128RegClass);
}
}
diff --git a/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
index fd75c41040e16..5f682d86d26e2 100644
--- a/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
@@ -219,13 +219,13 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
}
}
if (IsReduction) {
- unsigned SubRegIndex = AMDGPURegisterInfo::getSubRegFromChannel(Chan);
+ unsigned SubRegIndex = R600RegisterInfo::getSubRegFromChannel(Chan);
Src0 = TRI.getSubReg(Src0, SubRegIndex);
Src1 = TRI.getSubReg(Src1, SubRegIndex);
} else if (IsCube) {
static const int CubeSrcSwz[] = {2, 2, 0, 1};
- unsigned SubRegIndex0 = AMDGPURegisterInfo::getSubRegFromChannel(CubeSrcSwz[Chan]);
- unsigned SubRegIndex1 = AMDGPURegisterInfo::getSubRegFromChannel(CubeSrcSwz[3 - Chan]);
+ unsigned SubRegIndex0 = R600RegisterInfo::getSubRegFromChannel(CubeSrcSwz[Chan]);
+ unsigned SubRegIndex1 = R600RegisterInfo::getSubRegFromChannel(CubeSrcSwz[3 - Chan]);
Src1 = TRI.getSubReg(Src0, SubRegIndex1);
Src0 = TRI.getSubReg(Src0, SubRegIndex0);
}
@@ -234,7 +234,7 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
bool Mask = false;
bool NotLast = true;
if (IsCube) {
- unsigned SubRegIndex = AMDGPURegisterInfo::getSubRegFromChannel(Chan);
+ unsigned SubRegIndex = R600RegisterInfo::getSubRegFromChannel(Chan);
DstReg = TRI.getSubReg(DstReg, SubRegIndex);
} else {
// Mask the write if the original instruction does not write to
diff --git a/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp b/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp
index d9aa9ebe878d8..c568a4aa61c3e 100644
--- a/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp
@@ -18,9 +18,8 @@ using namespace llvm;
R600FrameLowering::~R600FrameLowering() = default;
/// \returns The number of registers allocated for \p FI.
-int R600FrameLowering::getFrameIndexReference(const MachineFunction &MF,
- int FI,
- unsigned &FrameReg) const {
+int R600FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
+ Register &FrameReg) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
const R600RegisterInfo *RI
= MF.getSubtarget<R600Subtarget>().getRegisterInfo();
@@ -35,15 +34,15 @@ int R600FrameLowering::getFrameIndexReference(const MachineFunction &MF,
int UpperBound = FI == -1 ? MFI.getNumObjects() : FI;
for (int i = MFI.getObjectIndexBegin(); i < UpperBound; ++i) {
- OffsetBytes = alignTo(OffsetBytes, MFI.getObjectAlignment(i));
+ OffsetBytes = alignTo(OffsetBytes, MFI.getObjectAlign(i));
OffsetBytes += MFI.getObjectSize(i);
// Each register holds 4 bytes, so we must always align the offset to at
// least 4 bytes, so that 2 frame objects won't share the same register.
- OffsetBytes = alignTo(OffsetBytes, 4);
+ OffsetBytes = alignTo(OffsetBytes, Align(4));
}
if (FI != -1)
- OffsetBytes = alignTo(OffsetBytes, MFI.getObjectAlignment(FI));
+ OffsetBytes = alignTo(OffsetBytes, MFI.getObjectAlign(FI));
return OffsetBytes / (getStackWidth(MF) * 4);
}
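For context, a worked sketch of the arithmetic at the end of getFrameIndexReference (illustrative values only, not from the patch): byte offsets are converted to register indices by dividing by getStackWidth(MF) * 4, since every R600 register lane holds 4 bytes.

#include <cassert>

int main() {
  const unsigned StackWidth = 1;          // hypothetical: one register per stack row
  unsigned OffsetBytes = 0;

  // One 4-byte object, aligned and then padded as in the loop above.
  OffsetBytes = (OffsetBytes + 3) & ~3u;  // alignTo(OffsetBytes, object alignment)
  OffsetBytes += 4;                       // object size
  OffsetBytes = (OffsetBytes + 3) & ~3u;  // alignTo(OffsetBytes, Align(4))

  // The next frame index therefore starts at register index 1, so two frame
  // objects never share the same register.
  assert(OffsetBytes / (StackWidth * 4) == 1);
  return 0;
}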
diff --git a/llvm/lib/Target/AMDGPU/R600FrameLowering.h b/llvm/lib/Target/AMDGPU/R600FrameLowering.h
index 283e4d1935ea1..b877ecd298290 100644
--- a/llvm/lib/Target/AMDGPU/R600FrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/R600FrameLowering.h
@@ -16,7 +16,7 @@ namespace llvm {
class R600FrameLowering : public AMDGPUFrameLowering {
public:
R600FrameLowering(StackDirection D, Align StackAl, int LAO,
- Align TransAl = Align::None())
+ Align TransAl = Align(1))
: AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {}
~R600FrameLowering() override;
@@ -25,7 +25,7 @@ public:
void emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const override {}
int getFrameIndexReference(const MachineFunction &MF, int FI,
- unsigned &FrameReg) const override;
+ Register &FrameReg) const override;
bool hasFP(const MachineFunction &MF) const override {
return false;
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
index 1b1f5f9a404a7..dc2e73e1f94e0 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -615,21 +615,27 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
return LowerImplicitParameter(DAG, VT, DL, 8);
case Intrinsic::r600_read_tgid_x:
+ case Intrinsic::amdgcn_workgroup_id_x:
return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
R600::T1_X, VT);
case Intrinsic::r600_read_tgid_y:
+ case Intrinsic::amdgcn_workgroup_id_y:
return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
R600::T1_Y, VT);
case Intrinsic::r600_read_tgid_z:
+ case Intrinsic::amdgcn_workgroup_id_z:
return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
R600::T1_Z, VT);
case Intrinsic::r600_read_tidig_x:
+ case Intrinsic::amdgcn_workitem_id_x:
return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
R600::T0_X, VT);
case Intrinsic::r600_read_tidig_y:
+ case Intrinsic::amdgcn_workitem_id_y:
return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
R600::T0_Y, VT);
case Intrinsic::r600_read_tidig_z:
+ case Intrinsic::amdgcn_workitem_id_z:
return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
R600::T0_Z, VT);
@@ -699,9 +705,8 @@ SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
SmallVector<SDValue, 8> Args;
for (unsigned i = 0, e = VecVT.getVectorNumElements(); i != e; ++i) {
- Args.push_back(DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector,
- DAG.getConstant(i, DL, getVectorIdxTy(DAG.getDataLayout()))));
+ Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector,
+ DAG.getVectorIdxConstant(i, DL)));
}
return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
@@ -1260,10 +1265,11 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
return scalarizeVectorStore(StoreNode, DAG);
}
- unsigned Align = StoreNode->getAlignment();
- if (Align < MemVT.getStoreSize() &&
- !allowsMisalignedMemoryAccesses(
- MemVT, AS, Align, StoreNode->getMemOperand()->getFlags(), nullptr)) {
+ Align Alignment = StoreNode->getAlign();
+ if (Alignment < MemVT.getStoreSize() &&
+ !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment.value(),
+ StoreNode->getMemOperand()->getFlags(),
+ nullptr)) {
return expandUnalignedStore(StoreNode, DAG);
}
@@ -1543,7 +1549,7 @@ SDValue R600TargetLowering::lowerFrameIndex(SDValue Op,
FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op);
unsigned FrameIndex = FIN->getIndex();
- unsigned IgnoredFrameReg;
+ Register IgnoredFrameReg;
unsigned Offset =
TFL->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg);
return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), SDLoc(Op),
diff --git a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
index 346296c773775..088cf16d8ed2c 100644
--- a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
@@ -77,7 +77,7 @@ void R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (VectorComponents > 0) {
for (unsigned I = 0; I < VectorComponents; I++) {
- unsigned SubRegIndex = AMDGPURegisterInfo::getSubRegFromChannel(I);
+ unsigned SubRegIndex = R600RegisterInfo::getSubRegFromChannel(I);
buildDefaultInstruction(MBB, MI, R600::MOV,
RI.getSubReg(DestReg, SubRegIndex),
RI.getSubReg(SrcReg, SubRegIndex))
@@ -541,7 +541,7 @@ R600InstrInfo::fitsReadPortLimitations(const std::vector<MachineInstr *> &IG,
std::vector<std::vector<std::pair<int, unsigned>>> IGSrcs;
ValidSwizzle.clear();
- unsigned ConstCount = 0;
+ unsigned ConstCount;
BankSwizzle TransBS = ALU_VEC_012_SCL_210;
for (unsigned i = 0, e = IG.size(); i < e; ++i) {
IGSrcs.push_back(ExtractSrcs(*IG[i], PV, ConstCount));
@@ -676,7 +676,7 @@ bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
bool AllowModify) const {
- // Most of the following comes from the ARM implementation of AnalyzeBranch
+ // Most of the following comes from the ARM implementation of analyzeBranch
// If the block has no terminators, it just falls into the block after it.
MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
@@ -1224,7 +1224,7 @@ int R600InstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const {
const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>();
const R600FrameLowering *TFL = ST.getFrameLowering();
- unsigned IgnoredFrameReg;
+ Register IgnoredFrameReg;
Offset = TFL->getFrameIndexReference(MF, -1, IgnoredFrameReg);
return getIndirectIndexBegin(MF) + Offset;
diff --git a/llvm/lib/Target/AMDGPU/R600Instructions.td b/llvm/lib/Target/AMDGPU/R600Instructions.td
index cbdf0de44f873..2cc21364c4397 100644
--- a/llvm/lib/Target/AMDGPU/R600Instructions.td
+++ b/llvm/lib/Target/AMDGPU/R600Instructions.td
@@ -1006,7 +1006,7 @@ class MULADD_Common <bits<5> inst> : R600_3OP <
class MULADD_IEEE_Common <bits<5> inst> : R600_3OP <
inst, "MULADD_IEEE",
- [(set f32:$dst, (fmad f32:$src0, f32:$src1, f32:$src2))]
+ [(set f32:$dst, (any_fmad f32:$src0, f32:$src1, f32:$src2))]
>;
class FMA_Common <bits<5> inst> : R600_3OP <
@@ -1233,6 +1233,11 @@ def : R600Pat<
def : RcpPat<recip_ieee, f32>;
}
+class SqrtPat<Instruction RsqInst, Instruction RecipInst> : R600Pat <
+ (fsqrt f32:$src),
+ (RecipInst (RsqInst $src))
+>;
+
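The new SqrtPat class expresses fsqrt as RECIP applied to the reciprocal-square-root result, relying on the identity sqrt(x) = 1 / rsqrt(x). A small standalone check of that identity (my illustration, not from the patch):

#include <cassert>
#include <cmath>

int main() {
  float X = 2.0f;
  float Rsq = 1.0f / std::sqrt(X); // what a reciprocal-square-root instruction returns
  float Sqrt = 1.0f / Rsq;         // RECIP of that result recovers sqrt(X)
  assert(std::fabs(Sqrt - std::sqrt(X)) < 1e-6f);
  return 0;
}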
//===----------------------------------------------------------------------===//
// R600 / R700 Instructions
//===----------------------------------------------------------------------===//
@@ -1272,8 +1277,8 @@ let Predicates = [isR600] in {
defm DIV_r600 : DIV_Common<RECIP_IEEE_r600>;
def : POW_Common <LOG_IEEE_r600, EXP_IEEE_r600, MUL>;
- def : R600Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_r600 $src))>;
def : RsqPat<RECIPSQRT_IEEE_r600, f32>;
+ def : SqrtPat<RECIPSQRT_IEEE_r600, RECIP_IEEE_r600>;
def R600_ExportSwz : ExportSwzInst {
let Word1{20-17} = 0; // BURST_COUNT
diff --git a/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
index cec7f563f4800..b0620663a2300 100644
--- a/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
+++ b/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
@@ -56,9 +56,8 @@ using namespace llvm;
#define DEBUG_TYPE "vec-merger"
-static bool isImplicitlyDef(MachineRegisterInfo &MRI, unsigned Reg) {
- assert(MRI.isSSA());
- if (Register::isPhysicalRegister(Reg))
+static bool isImplicitlyDef(MachineRegisterInfo &MRI, Register Reg) {
+ if (Reg.isPhysical())
return false;
const MachineInstr *MI = MRI.getUniqueVRegDef(Reg);
return MI && MI->isImplicitDef();
@@ -69,8 +68,8 @@ namespace {
class RegSeqInfo {
public:
MachineInstr *Instr;
- DenseMap<unsigned, unsigned> RegToChan;
- std::vector<unsigned> UndefReg;
+ DenseMap<Register, unsigned> RegToChan;
+ std::vector<Register> UndefReg;
RegSeqInfo(MachineRegisterInfo &MRI, MachineInstr *MI) : Instr(MI) {
assert(MI->getOpcode() == R600::REG_SEQUENCE);
@@ -102,7 +101,7 @@ private:
InstructionSetMap PreviousRegSeqByUndefCount;
bool canSwizzle(const MachineInstr &MI) const;
- bool areAllUsesSwizzeable(unsigned Reg) const;
+ bool areAllUsesSwizzeable(Register Reg) const;
void SwizzleInput(MachineInstr &,
const std::vector<std::pair<unsigned, unsigned>> &RemapChan) const;
bool tryMergeVector(const RegSeqInfo *Untouched, RegSeqInfo *ToMerge,
@@ -130,6 +129,11 @@ public:
MachineFunctionPass::getAnalysisUsage(AU);
}
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties()
+ .set(MachineFunctionProperties::Property::IsSSA);
+ }
+
StringRef getPassName() const override {
return "R600 Vector Registers Merge Pass";
}
@@ -165,9 +169,9 @@ bool R600VectorRegMerger::tryMergeVector(const RegSeqInfo *Untouched,
RegSeqInfo *ToMerge, std::vector< std::pair<unsigned, unsigned>> &Remap)
const {
unsigned CurrentUndexIdx = 0;
- for (DenseMap<unsigned, unsigned>::iterator It = ToMerge->RegToChan.begin(),
+ for (DenseMap<Register, unsigned>::iterator It = ToMerge->RegToChan.begin(),
E = ToMerge->RegToChan.end(); It != E; ++It) {
- DenseMap<unsigned, unsigned>::const_iterator PosInUntouched =
+ DenseMap<Register, unsigned>::const_iterator PosInUntouched =
Untouched->RegToChan.find((*It).first);
if (PosInUntouched != Untouched->RegToChan.end()) {
Remap.push_back(std::pair<unsigned, unsigned>
@@ -203,9 +207,9 @@ MachineInstr *R600VectorRegMerger::RebuildVector(
DebugLoc DL = Pos->getDebugLoc();
Register SrcVec = BaseRSI->Instr->getOperand(0).getReg();
- DenseMap<unsigned, unsigned> UpdatedRegToChan = BaseRSI->RegToChan;
- std::vector<unsigned> UpdatedUndef = BaseRSI->UndefReg;
- for (DenseMap<unsigned, unsigned>::iterator It = RSI->RegToChan.begin(),
+ DenseMap<Register, unsigned> UpdatedRegToChan = BaseRSI->RegToChan;
+ std::vector<Register> UpdatedUndef = BaseRSI->UndefReg;
+ for (DenseMap<Register, unsigned>::iterator It = RSI->RegToChan.begin(),
E = RSI->RegToChan.end(); It != E; ++It) {
Register DstReg = MRI->createVirtualRegister(&R600::R600_Reg128RegClass);
unsigned SubReg = (*It).first;
@@ -218,7 +222,7 @@ MachineInstr *R600VectorRegMerger::RebuildVector(
.addReg(SubReg)
.addImm(Chan);
UpdatedRegToChan[SubReg] = Chan;
- std::vector<unsigned>::iterator ChanPos = llvm::find(UpdatedUndef, Chan);
+ std::vector<Register>::iterator ChanPos = llvm::find(UpdatedUndef, Chan);
if (ChanPos != UpdatedUndef.end())
UpdatedUndef.erase(ChanPos);
assert(!is_contained(UpdatedUndef, Chan) &&
@@ -279,7 +283,7 @@ void R600VectorRegMerger::SwizzleInput(MachineInstr &MI,
}
}
-bool R600VectorRegMerger::areAllUsesSwizzeable(unsigned Reg) const {
+bool R600VectorRegMerger::areAllUsesSwizzeable(Register Reg) const {
for (MachineRegisterInfo::use_instr_iterator It = MRI->use_instr_begin(Reg),
E = MRI->use_instr_end(); It != E; ++It) {
if (!canSwizzle(*It))
@@ -322,7 +326,7 @@ bool R600VectorRegMerger::tryMergeUsingFreeSlot(RegSeqInfo &RSI,
}
void R600VectorRegMerger::trackRSI(const RegSeqInfo &RSI) {
- for (DenseMap<unsigned, unsigned>::const_iterator
+ for (DenseMap<Register, unsigned>::const_iterator
It = RSI.RegToChan.begin(), E = RSI.RegToChan.end(); It != E; ++It) {
PreviousRegSeqByReg[(*It).first].push_back(RSI.Instr);
}
diff --git a/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp b/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp
index ef12c1d245941..78ef71cdf8e3b 100644
--- a/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp
@@ -20,14 +20,21 @@
using namespace llvm;
-R600RegisterInfo::R600RegisterInfo() : R600GenRegisterInfo(0) {
- RCW.RegWeight = 0;
- RCW.WeightLimit = 0;
-}
-
#define GET_REGINFO_TARGET_DESC
#include "R600GenRegisterInfo.inc"
+unsigned R600RegisterInfo::getSubRegFromChannel(unsigned Channel) {
+ static const uint16_t SubRegFromChannelTable[] = {
+ R600::sub0, R600::sub1, R600::sub2, R600::sub3,
+ R600::sub4, R600::sub5, R600::sub6, R600::sub7,
+ R600::sub8, R600::sub9, R600::sub10, R600::sub11,
+ R600::sub12, R600::sub13, R600::sub14, R600::sub15
+ };
+
+ assert(Channel < array_lengthof(SubRegFromChannelTable));
+ return SubRegFromChannelTable[Channel];
+}
+
BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
@@ -87,11 +94,6 @@ const TargetRegisterClass * R600RegisterInfo::getCFGStructurizerRegClass(
}
}
-const RegClassWeight &R600RegisterInfo::getRegClassWeight(
- const TargetRegisterClass *RC) const {
- return RCW;
-}
-
bool R600RegisterInfo::isPhysRegLiveAcrossClauses(unsigned Reg) const {
assert(!Register::isVirtualRegister(Reg));
diff --git a/llvm/lib/Target/AMDGPU/R600RegisterInfo.h b/llvm/lib/Target/AMDGPU/R600RegisterInfo.h
index 9378b70ca5807..06981c4cf9c5e 100644
--- a/llvm/lib/Target/AMDGPU/R600RegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/R600RegisterInfo.h
@@ -20,9 +20,11 @@
namespace llvm {
struct R600RegisterInfo final : public R600GenRegisterInfo {
- RegClassWeight RCW;
+ R600RegisterInfo() : R600GenRegisterInfo(0) {}
- R600RegisterInfo();
+ /// \returns the sub reg enum value for the given \p Channel
+ /// (e.g. getSubRegFromChannel(0) -> R600::sub0)
+ static unsigned getSubRegFromChannel(unsigned Channel);
BitVector getReservedRegs(const MachineFunction &MF) const override;
const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
@@ -37,8 +39,9 @@ struct R600RegisterInfo final : public R600GenRegisterInfo {
/// CFGStructurizer
const TargetRegisterClass *getCFGStructurizerRegClass(MVT VT) const;
- const RegClassWeight &
- getRegClassWeight(const TargetRegisterClass *RC) const override;
+ bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override {
+ return false;
+ }
// \returns true if \p Reg can be defined in one ALU clause and used in
// another.
diff --git a/llvm/lib/Target/AMDGPU/R600RegisterInfo.td b/llvm/lib/Target/AMDGPU/R600RegisterInfo.td
index 02164b74a01bd..fdff7541edeca 100644
--- a/llvm/lib/Target/AMDGPU/R600RegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/R600RegisterInfo.td
@@ -150,13 +150,16 @@ def AR_X : R600Reg<"AR.x", 0>;
def INDIRECT_BASE_ADDR : R600Reg <"INDIRECT_BASE_ADDR", 0>;
def R600_ArrayBase : RegisterClass <"AMDGPU", [f32, i32], 32,
- (add (sequence "ArrayBase%u", 448, 480))>;
+ (add (sequence "ArrayBase%u", 448, 480))> {
+ let Weight = 0;
+}
// special registers for ALU src operands
// const buffer reference, SRCx_SEL contains index
def ALU_CONST : R600Reg<"CBuf", 0>;
// interpolation param reference, SRCx_SEL contains index
def ALU_PARAM : R600Reg<"Param", 0>;
+let Weight = 0 in {
let isAllocatable = 0 in {
def R600_Addr : RegisterClass <"AMDGPU", [i32], 32, (add (sequence "Addr%u_X", 0, 127))>;
@@ -251,3 +254,4 @@ def R600_Reg64 : RegisterClass<"AMDGPU", [v2f32, v2i32, i64, f64], 64,
def R600_Reg64Vertical : RegisterClass<"AMDGPU", [v2f32, v2i32], 64,
(add V01_X, V01_Y, V01_Z, V01_W,
V23_X, V23_Y, V23_Z, V23_W)>;
+} // End let Weight = 0
diff --git a/llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp b/llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp
index ee011286b8ff3..90e48c63b5dca 100644
--- a/llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp
+++ b/llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp
@@ -111,10 +111,6 @@ bool SIAddIMGInit::runOnMachineFunction(MachineFunction &MF) {
unsigned ActiveLanes =
TII->isGather4(Opcode) ? 4 : countPopulation(dmask);
- // Subreg indices are counted from 1
- // When D16 then we want next whole VGPR after write data.
- static_assert(AMDGPU::sub0 == 1 && AMDGPU::sub4 == 5, "Subreg indices different from expected");
-
bool Packed = !ST.hasUnpackedD16VMem();
unsigned InitIdx =
@@ -137,7 +133,7 @@ bool SIAddIMGInit::runOnMachineFunction(MachineFunction &MF) {
// all the result registers to 0, otherwise just the error indication
// register (VGPRn+1)
unsigned SizeLeft = ST.usePRTStrictNull() ? InitIdx : 1;
- unsigned CurrIdx = ST.usePRTStrictNull() ? 1 : InitIdx;
+ unsigned CurrIdx = ST.usePRTStrictNull() ? 0 : (InitIdx - 1);
if (DstSize == 1) {
// In this case we can just initialize the result directly
@@ -158,7 +154,7 @@ bool SIAddIMGInit::runOnMachineFunction(MachineFunction &MF) {
BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
.addReg(PrevDst)
.addReg(SubReg)
- .addImm(CurrIdx);
+ .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
PrevDst = NewDst;
}
diff --git a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
index 27320472cacb3..3c41bf1fef5e9 100644
--- a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -153,7 +153,7 @@ void SIAnnotateControlFlow::initialize(Module &M, const GCNSubtarget &ST) {
Else = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_else,
{ IntMask, IntMask });
IfBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if_break,
- { IntMask, IntMask });
+ { IntMask });
Loop = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_loop, { IntMask });
EndCf = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_end_cf, { IntMask });
}
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 23ef56afc39c9..4f7d255eb450a 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -333,7 +333,9 @@ enum Id { // HwRegCode, (6) [5:0]
ID_FLAT_SCR_HI = 21,
ID_XNACK_MASK = 22,
ID_POPS_PACKER = 25,
- ID_SYMBOLIC_LAST_ = 26,
+ ID_SHADER_CYCLES = 29,
+ ID_SYMBOLIC_FIRST_GFX1030_ = ID_SHADER_CYCLES,
+ ID_SYMBOLIC_LAST_ = 30,
ID_SHIFT_ = 0,
ID_WIDTH_ = 6,
ID_MASK_ = (((1 << ID_WIDTH_) - 1) << ID_SHIFT_)
@@ -366,6 +368,28 @@ enum Width : unsigned {
WIDTH_DEFAULT_ = WIDTH_M1_DEFAULT_ + 1,
};
+enum ModeRegisterMasks : uint32_t {
+ FP_ROUND_MASK = 0xf << 0, // Bits 0..3
+ FP_DENORM_MASK = 0xf << 4, // Bits 4..7
+ DX10_CLAMP_MASK = 1 << 8,
+ IEEE_MODE_MASK = 1 << 9,
+ LOD_CLAMP_MASK = 1 << 10,
+ DEBUG_MASK = 1 << 11,
+
+ // EXCP_EN fields.
+ EXCP_EN_INVALID_MASK = 1 << 12,
+ EXCP_EN_INPUT_DENORMAL_MASK = 1 << 13,
+ EXCP_EN_FLOAT_DIV0_MASK = 1 << 14,
+ EXCP_EN_OVERFLOW_MASK = 1 << 15,
+ EXCP_EN_UNDERFLOW_MASK = 1 << 16,
+ EXCP_EN_INEXACT_MASK = 1 << 17,
+ EXCP_EN_INT_DIV0_MASK = 1 << 18,
+
+ GPR_IDX_EN_MASK = 1 << 27,
+ VSKIP_MASK = 1 << 28,
+ CSP_MASK = 0x7u << 29 // Bits 29..31
+};
+
} // namespace Hwreg
namespace Swizzle { // Encoding of swizzle macro used in ds_swizzle_b32.
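The new Hwreg::ModeRegisterMasks values describe the bit layout of the MODE register. A standalone sketch of how a field could be pulled out with them; the decode helper and the sample value are my own illustration, not LLVM API:

#include <cassert>
#include <cstdint>

// Local copies of a few of the masks defined above.
constexpr uint32_t FP_ROUND_MASK = 0xfu << 0;  // bits 0..3
constexpr uint32_t FP_DENORM_MASK = 0xfu << 4; // bits 4..7
constexpr uint32_t IEEE_MODE_MASK = 1u << 9;

// Extract a field by shifting down to the mask's lowest set bit.
static uint32_t decodeField(uint32_t Mode, uint32_t Mask) {
  unsigned Shift = 0;
  while (((Mask >> Shift) & 1) == 0)
    ++Shift;
  return (Mode & Mask) >> Shift;
}

int main() {
  uint32_t Mode = (0x3u << 0) | (0xcu << 4) | (1u << 9); // hypothetical MODE value
  assert(decodeField(Mode, FP_ROUND_MASK) == 0x3);
  assert(decodeField(Mode, FP_DENORM_MASK) == 0xc);
  assert(decodeField(Mode, IEEE_MODE_MASK) == 1);
  return 0;
}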
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 914d2a5ef1485..ef64c5674bd1c 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -587,6 +587,11 @@ static bool hoistAndMergeSGPRInits(unsigned Reg,
}
bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
+ // Only need to run this in SelectionDAG path.
+ if (MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::Selected))
+ return false;
+
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
MRI = &MF.getRegInfo();
TRI = ST.getRegisterInfo();
@@ -761,6 +766,7 @@ void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
bool AllAGPRUses = true;
SetVector<const MachineInstr *> worklist;
SmallSet<const MachineInstr *, 4> Visited;
+ SetVector<MachineInstr *> PHIOperands;
worklist.insert(&MI);
Visited.insert(&MI);
while (!worklist.empty()) {
@@ -805,6 +811,11 @@ void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
if (AllAGPRUses && numVGPRUses && !TRI->hasAGPRs(RC0)) {
LLVM_DEBUG(dbgs() << "Moving PHI to AGPR: " << MI);
MRI->setRegClass(PHIRes, TRI->getEquivalentAGPRClass(RC0));
+ for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
+ MachineInstr *DefMI = MRI->getVRegDef(MI.getOperand(I).getReg());
+ if (DefMI && DefMI->isPHI())
+ PHIOperands.insert(DefMI);
+ }
}
bool hasVGPRInput = false;
@@ -824,8 +835,22 @@ void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
}
else if (Def->isCopy() &&
TRI->isVectorRegister(*MRI, Def->getOperand(1).getReg())) {
- hasVGPRInput = true;
- break;
+ Register SrcReg = Def->getOperand(1).getReg();
+ MachineInstr *SrcDef = MRI->getVRegDef(SrcReg);
+ unsigned SMovOp;
+ int64_t Imm;
+ if (!isSafeToFoldImmIntoCopy(Def, SrcDef, TII, SMovOp, Imm)) {
+ hasVGPRInput = true;
+ break;
+ } else {
+ // Strictly speaking, this fold would also happen on the next iteration of
+ // the runOnMachineFunction main loop, but there is no reason to delay it.
+ MachineFunction *MF = MI.getParent()->getParent();
+ Def->getOperand(1).ChangeToImmediate(Imm);
+ Def->addImplicitDefUseOperands(*MF);
+ Def->setDesc(TII->get(SMovOp));
+ }
}
}
@@ -840,4 +865,8 @@ void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
TII->legalizeOperands(MI, MDT);
}
+ // Propagate register class back to PHI operands which are PHI themselves.
+ while (!PHIOperands.empty()) {
+ processPHINode(*PHIOperands.pop_back_val());
+ }
}
diff --git a/llvm/lib/Target/AMDGPU/SIFixupVectorISel.cpp b/llvm/lib/Target/AMDGPU/SIFixupVectorISel.cpp
index a0119297b112f..8e3402b537b3b 100644
--- a/llvm/lib/Target/AMDGPU/SIFixupVectorISel.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixupVectorISel.cpp
@@ -217,6 +217,11 @@ static bool fixupGlobalSaddr(MachineBasicBlock &MBB,
}
bool SIFixupVectorISel::runOnMachineFunction(MachineFunction &MF) {
+ // Only need to run this in SelectionDAG path.
+ if (MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::Selected))
+ return false;
+
if (skipFunction(MF.getFunction()))
return false;
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 2ff8baf29394f..ffcf4c30bc70d 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -282,6 +282,9 @@ static bool updateOperand(FoldCandidate &Fold,
assert(!Fold.needsShrink() && "not handled");
if (Fold.isImm()) {
+ // FIXME: ChangeToImmediate should probably clear the subreg flags. It's
+ // reinterpreted as TargetFlags.
+ Old.setSubReg(0);
Old.ChangeToImmediate(Fold.ImmToFold);
return true;
}
@@ -612,19 +615,26 @@ void SIFoldOperands::foldOperand(
if (frameIndexMayFold(TII, *UseMI, UseOpIdx, OpToFold)) {
// Sanity check that this is a stack access.
// FIXME: Should probably use stack pseudos before frame lowering.
- MachineOperand *SOff = TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
- if (!SOff->isReg() || (SOff->getReg() != MFI->getScratchWaveOffsetReg() &&
- SOff->getReg() != MFI->getStackPtrOffsetReg()))
- return;
if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
MFI->getScratchRSrcReg())
return;
+ // Ensure this is either relative to the current frame or the current wave.
+ MachineOperand &SOff =
+ *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
+ if ((!SOff.isReg() || SOff.getReg() != MFI->getStackPtrOffsetReg()) &&
+ (!SOff.isImm() || SOff.getImm() != 0))
+ return;
+
// A frame index will resolve to a positive constant, so it should always be
// safe to fold the addressing mode, even pre-GFX9.
UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex());
- SOff->setReg(MFI->getStackPtrOffsetReg());
+
+ // If this is relative to the current wave, update it to be relative to the
+ // current frame.
+ if (SOff.isImm())
+ SOff.ChangeToRegister(MFI->getStackPtrOffsetReg(), false);
return;
}
@@ -907,6 +917,21 @@ static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
case AMDGPU::S_XOR_B32:
Result = LHS ^ RHS;
return true;
+ case AMDGPU::S_XNOR_B32:
+ Result = ~(LHS ^ RHS);
+ return true;
+ case AMDGPU::S_NAND_B32:
+ Result = ~(LHS & RHS);
+ return true;
+ case AMDGPU::S_NOR_B32:
+ Result = ~(LHS | RHS);
+ return true;
+ case AMDGPU::S_ANDN2_B32:
+ Result = LHS & ~RHS;
+ return true;
+ case AMDGPU::S_ORN2_B32:
+ Result = LHS | ~RHS;
+ return true;
case AMDGPU::V_LSHL_B32_e64:
case AMDGPU::V_LSHL_B32_e32:
case AMDGPU::S_LSHL_B32:
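The cases added above let the folder evaluate more SALU ops when both inputs are known constants. A tiny standalone mirror of the new identities (my sketch; the helper names stand in for the real opcodes):

#include <cassert>
#include <cstdint>

static uint32_t xnor(uint32_t L, uint32_t R)  { return ~(L ^ R); } // S_XNOR_B32
static uint32_t nand_(uint32_t L, uint32_t R) { return ~(L & R); } // S_NAND_B32
static uint32_t nor_(uint32_t L, uint32_t R)  { return ~(L | R); } // S_NOR_B32
static uint32_t andn2(uint32_t L, uint32_t R) { return L & ~R; }   // S_ANDN2_B32
static uint32_t orn2(uint32_t L, uint32_t R)  { return L | ~R; }   // S_ORN2_B32

int main() {
  const uint32_t A = 0x12345678u, B = 0x0F0F0F0Fu;
  // With both sources known, each op folds to a single constant that the pass
  // can materialize with one S_MOV_B32.
  assert(xnor(A, B)  == 0xE2C4A688u);
  assert(nand_(A, B) == 0xFDFBF9F7u);
  assert(nor_(A, B)  == 0xE0C0A080u);
  assert(andn2(A, B) == 0x10305070u);
  assert(orn2(A, B)  == 0xF2F4F6F8u);
  return 0;
}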
@@ -1007,10 +1032,16 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
if (!Src0->isImm() && !Src1->isImm())
return false;
- if (MI->getOpcode() == AMDGPU::V_LSHL_OR_B32) {
+ if (MI->getOpcode() == AMDGPU::V_LSHL_OR_B32 ||
+ MI->getOpcode() == AMDGPU::V_LSHL_ADD_U32 ||
+ MI->getOpcode() == AMDGPU::V_AND_OR_B32) {
if (Src0->isImm() && Src0->getImm() == 0) {
// v_lshl_or_b32 0, X, Y -> copy Y
// v_lshl_or_b32 0, X, K -> v_mov_b32 K
+ // v_lshl_add_b32 0, X, Y -> copy Y
+ // v_lshl_add_b32 0, X, K -> v_mov_b32 K
+ // v_and_or_b32 0, X, Y -> copy Y
+ // v_and_or_b32 0, X, K -> v_mov_b32 K
bool UseCopy = TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->isReg();
MI->RemoveOperand(Src1Idx);
MI->RemoveOperand(Src0Idx);
@@ -1381,8 +1412,8 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const {
case AMDGPU::V_MUL_F32_e64:
case AMDGPU::V_MUL_F16_e64: {
// If output denormals are enabled, omod is ignored.
- if ((Op == AMDGPU::V_MUL_F32_e64 && MFI->getMode().FP32Denormals) ||
- (Op == AMDGPU::V_MUL_F16_e64 && MFI->getMode().FP64FP16Denormals))
+ if ((Op == AMDGPU::V_MUL_F32_e64 && MFI->getMode().FP32OutputDenormals) ||
+ (Op == AMDGPU::V_MUL_F16_e64 && MFI->getMode().FP64FP16OutputDenormals))
return std::make_pair(nullptr, SIOutMods::NONE);
const MachineOperand *RegOp = nullptr;
@@ -1411,8 +1442,8 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const {
case AMDGPU::V_ADD_F32_e64:
case AMDGPU::V_ADD_F16_e64: {
// If output denormals are enabled, omod is ignored.
- if ((Op == AMDGPU::V_ADD_F32_e64 && MFI->getMode().FP32Denormals) ||
- (Op == AMDGPU::V_ADD_F16_e64 && MFI->getMode().FP64FP16Denormals))
+ if ((Op == AMDGPU::V_ADD_F32_e64 && MFI->getMode().FP32OutputDenormals) ||
+ (Op == AMDGPU::V_ADD_F16_e64 && MFI->getMode().FP64FP16OutputDenormals))
return std::make_pair(nullptr, SIOutMods::NONE);
// Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 8364665dda04c..a2e802009d098 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -24,18 +24,6 @@ using namespace llvm;
#define DEBUG_TYPE "frame-info"
-static ArrayRef<MCPhysReg> getAllSGPR128(const GCNSubtarget &ST,
- const MachineFunction &MF) {
- return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
- ST.getMaxNumSGPRs(MF) / 4);
-}
-
-static ArrayRef<MCPhysReg> getAllSGPRs(const GCNSubtarget &ST,
- const MachineFunction &MF) {
- return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(),
- ST.getMaxNumSGPRs(MF));
-}
-
// Find a scratch register that we can use at the start of the prologue to
// re-align the stack pointer. We avoid using callee-save registers since they
// may appear to be free when this is called from canUseAsPrologue (during
@@ -47,10 +35,10 @@ static ArrayRef<MCPhysReg> getAllSGPRs(const GCNSubtarget &ST,
// but we would then have to make sure that we were in fact saving at least one
// callee-save register in the prologue, which is additional complexity that
// doesn't seem worth the benefit.
-static unsigned findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
- LivePhysRegs &LiveRegs,
- const TargetRegisterClass &RC,
- bool Unused = false) {
+static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
+ LivePhysRegs &LiveRegs,
+ const TargetRegisterClass &RC,
+ bool Unused = false) {
// Mark callee saved registers as used so we will not choose them.
const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
for (unsigned i = 0; CSRegs[i]; ++i)
@@ -59,12 +47,12 @@ static unsigned findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
if (Unused) {
// We are looking for a register that can be used throughout the entire
// function, so any use is unacceptable.
- for (unsigned Reg : RC) {
+ for (MCRegister Reg : RC) {
if (!MRI.isPhysRegUsed(Reg) && LiveRegs.available(MRI, Reg))
return Reg;
}
} else {
- for (unsigned Reg : RC) {
+ for (MCRegister Reg : RC) {
if (LiveRegs.available(MRI, Reg))
return Reg;
}
@@ -76,14 +64,67 @@ static unsigned findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
if (!Unused)
report_fatal_error("failed to find free scratch register");
- return AMDGPU::NoRegister;
+ return MCRegister();
}
-static MCPhysReg findUnusedSGPRNonCalleeSaved(MachineRegisterInfo &MRI) {
- LivePhysRegs LiveRegs;
- LiveRegs.init(*MRI.getTargetRegisterInfo());
- return findScratchNonCalleeSaveRegister(
- MRI, LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass, true);
+static void getVGPRSpillLaneOrTempRegister(MachineFunction &MF,
+ LivePhysRegs &LiveRegs,
+ Register &TempSGPR,
+ Optional<int> &FrameIndex,
+ bool IsFP) {
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+
+#ifndef NDEBUG
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+#endif
+
+ // We need to save and restore the current FP/BP.
+
+ // 1: If there is already a VGPR with free lanes, use it. We
+ // may already have to pay the penalty for spilling a CSR VGPR.
+ if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) {
+ int NewFI = FrameInfo.CreateStackObject(4, Align(4), true, nullptr,
+ TargetStackID::SGPRSpill);
+
+ if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
+ llvm_unreachable("allocate SGPR spill should have worked");
+
+ FrameIndex = NewFI;
+
+ LLVM_DEBUG(auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
+ dbgs() << "Spilling " << (IsFP ? "FP" : "BP") << " to "
+ << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
+ << '\n');
+ return;
+ }
+
+ // 2: Next, try to save the FP/BP in an unused SGPR.
+ TempSGPR = findScratchNonCalleeSaveRegister(
+ MF.getRegInfo(), LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass, true);
+
+ if (!TempSGPR) {
+ int NewFI = FrameInfo.CreateStackObject(4, Align(4), true, nullptr,
+ TargetStackID::SGPRSpill);
+
+ if (MFI->allocateSGPRSpillToVGPR(MF, NewFI)) {
+ // 3: There's no free lane to spill, and no free register to save FP/BP,
+ // so we're forced to spill another VGPR to use for the spill.
+ FrameIndex = NewFI;
+ } else {
+ // 4: If all else fails, spill the FP/BP to memory.
+ FrameIndex = FrameInfo.CreateSpillStackObject(4, Align(4));
+ }
+
+ LLVM_DEBUG(auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
+ dbgs() << (IsFP ? "FP" : "BP") << " requires fallback spill to "
+ << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
+ << '\n';);
+ } else {
+ LLVM_DEBUG(dbgs() << "Saving " << (IsFP ? "FP" : "BP") << " with copy to "
+ << printReg(TempSGPR, TRI) << '\n');
+ }
}
// We need to specially emit stack operations here because a different frame
@@ -91,8 +132,8 @@ static MCPhysReg findUnusedSGPRNonCalleeSaved(MachineRegisterInfo &MRI) {
// use.
static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
- const SIInstrInfo *TII, unsigned SpillReg,
- unsigned ScratchRsrcReg, unsigned SPReg, int FI) {
+ const SIInstrInfo *TII, Register SpillReg,
+ Register ScratchRsrcReg, Register SPReg, int FI) {
MachineFunction *MF = MBB.getParent();
MachineFrameInfo &MFI = MF->getFrameInfo();
@@ -100,7 +141,7 @@ static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
MachineMemOperand *MMO = MF->getMachineMemOperand(
MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, 4,
- MFI.getObjectAlignment(FI));
+ MFI.getObjectAlign(FI));
if (isUInt<12>(Offset)) {
BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFSET))
@@ -139,15 +180,15 @@ static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
- const SIInstrInfo *TII, unsigned SpillReg,
- unsigned ScratchRsrcReg, unsigned SPReg, int FI) {
+ const SIInstrInfo *TII, Register SpillReg,
+ Register ScratchRsrcReg, Register SPReg, int FI) {
MachineFunction *MF = MBB.getParent();
MachineFrameInfo &MFI = MF->getFrameInfo();
int64_t Offset = MFI.getObjectOffset(FI);
MachineMemOperand *MMO = MF->getMachineMemOperand(
MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, 4,
- MFI.getObjectAlignment(FI));
+ MFI.getObjectAlign(FI));
if (isUInt<12>(Offset)) {
BuildMI(MBB, I, DebugLoc(),
@@ -184,11 +225,13 @@ static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
.addMemOperand(MMO);
}
-void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST,
- MachineFunction &MF,
- MachineBasicBlock &MBB) const {
+// Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
+void SIFrameLowering::emitEntryFunctionFlatScratchInit(
+ MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
- const SIRegisterInfo* TRI = &TII->getRegisterInfo();
+ const SIRegisterInfo *TRI = &TII->getRegisterInfo();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
// We don't need this if we only have spills since there is no user facing
@@ -201,11 +244,6 @@ void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST,
// pointer. Because we only detect if flat instructions are used at all,
// this will be used more often than necessary on VI.
- // Debug location must be unknown since the first debug location is used to
- // determine the end of the prologue.
- DebugLoc DL;
- MachineBasicBlock::iterator I = MBB.begin();
-
Register FlatScratchInitReg =
MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
@@ -216,8 +254,6 @@ void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST,
Register FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
Register FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
- unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
-
// Do a 64-bit pointer add.
if (ST.flatScratchIsPointer()) {
if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
@@ -266,19 +302,22 @@ void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST,
.addImm(8);
}
-unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg(
- const GCNSubtarget &ST,
- const SIInstrInfo *TII,
- const SIRegisterInfo *TRI,
- SIMachineFunctionInfo *MFI,
- MachineFunction &MF) const {
+// Shift down registers reserved for the scratch RSRC.
+Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
+ MachineFunction &MF) const {
+
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIRegisterInfo *TRI = &TII->getRegisterInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+ assert(MFI->isEntryFunction());
+
+ Register ScratchRsrcReg = MFI->getScratchRSrcReg();
- // We need to insert initialization of the scratch resource descriptor.
- unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
- if (ScratchRsrcReg == AMDGPU::NoRegister ||
- !MRI.isPhysRegUsed(ScratchRsrcReg))
- return AMDGPU::NoRegister;
+ if (!ScratchRsrcReg || !MRI.isPhysRegUsed(ScratchRsrcReg))
+ return Register();
if (ST.hasSGPRInitBug() ||
ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
@@ -293,18 +332,19 @@ unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg(
// cannot do this for the resources required for scratch access. For now we
// skip over user SGPRs and may leave unused holes.
- // We find the resource first because it has an alignment requirement.
-
unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
- ArrayRef<MCPhysReg> AllSGPR128s = getAllSGPR128(ST, MF);
+ ArrayRef<MCPhysReg> AllSGPR128s = TRI->getAllSGPR128(MF);
AllSGPR128s = AllSGPR128s.slice(std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));
// Skip the last N reserved elements because they should have already been
// reserved for VCC etc.
+ Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
for (MCPhysReg Reg : AllSGPR128s) {
// Pick the first unallocated one. Make sure we don't clobber the other
- // reserved input we needed.
- if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
+ // reserved input we needed. Also for PAL, make sure we don't clobber
+ // the GIT pointer passed in SGPR0 or SGPR8.
+ if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
+ !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
MRI.replaceRegWith(ScratchRsrcReg, Reg);
MFI->setScratchRSrcReg(Reg);
return Reg;
@@ -314,231 +354,138 @@ unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg(
return ScratchRsrcReg;
}
-// Shift down registers reserved for the scratch wave offset.
-std::pair<unsigned, bool>
-SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
- const GCNSubtarget &ST, const SIInstrInfo *TII, const SIRegisterInfo *TRI,
- SIMachineFunctionInfo *MFI, MachineFunction &MF) const {
- MachineRegisterInfo &MRI = MF.getRegInfo();
- unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
-
- assert(MFI->isEntryFunction());
-
- // No replacement necessary.
- if (ScratchWaveOffsetReg == AMDGPU::NoRegister ||
- (!hasFP(MF) && !MRI.isPhysRegUsed(ScratchWaveOffsetReg))) {
- return std::make_pair(AMDGPU::NoRegister, false);
- }
-
- if (ST.hasSGPRInitBug())
- return std::make_pair(ScratchWaveOffsetReg, false);
-
- unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
-
- ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(ST, MF);
- if (NumPreloaded > AllSGPRs.size())
- return std::make_pair(ScratchWaveOffsetReg, false);
-
- AllSGPRs = AllSGPRs.slice(NumPreloaded);
-
- // We need to drop register from the end of the list that we cannot use
- // for the scratch wave offset.
- // + 2 s102 and s103 do not exist on VI.
- // + 2 for vcc
- // + 2 for xnack_mask
- // + 2 for flat_scratch
- // + 4 for registers reserved for scratch resource register
- // + 1 for register reserved for scratch wave offset. (By excluding this
- // register from the list to consider, it means that when this
- // register is being used for the scratch wave offset and there
- // are no other free SGPRs, then the value will stay in this register.
- // + 1 if stack pointer is used.
- // ----
- // 13 (+1)
- unsigned ReservedRegCount = 13;
-
- if (AllSGPRs.size() < ReservedRegCount)
- return std::make_pair(ScratchWaveOffsetReg, false);
-
- bool HandledScratchWaveOffsetReg =
- ScratchWaveOffsetReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
- bool FPAdjusted = false;
-
- for (MCPhysReg Reg : AllSGPRs.drop_back(ReservedRegCount)) {
- // Pick the first unallocated SGPR. Be careful not to pick an alias of the
- // scratch descriptor, since we haven't added its uses yet.
- if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
- if (!HandledScratchWaveOffsetReg) {
- HandledScratchWaveOffsetReg = true;
-
- MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
- if (MFI->getScratchWaveOffsetReg() == MFI->getStackPtrOffsetReg()) {
- assert(!hasFP(MF));
- MFI->setStackPtrOffsetReg(Reg);
- }
-
- MFI->setScratchWaveOffsetReg(Reg);
- MFI->setFrameOffsetReg(Reg);
- ScratchWaveOffsetReg = Reg;
- FPAdjusted = true;
- break;
- }
- }
- }
-
- return std::make_pair(ScratchWaveOffsetReg, FPAdjusted);
-}
-
void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
- SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-
- // If we only have SGPR spills, we won't actually be using scratch memory
- // since these spill to VGPRs.
- //
- // FIXME: We should be cleaning up these unused SGPR spill frame indices
- // somewhere.
-
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- const SIInstrInfo *TII = ST.getInstrInfo();
- const SIRegisterInfo *TRI = &TII->getRegisterInfo();
- MachineRegisterInfo &MRI = MF.getRegInfo();
- const Function &F = MF.getFunction();
-
- // We need to do the replacement of the private segment buffer and wave offset
- // register even if there are no stack objects. There could be stores to undef
- // or a constant without an associated object.
+ // FIXME: If we only have SGPR spills, we won't actually be using scratch
+ // memory since these spill to VGPRs. We should be cleaning up these unused
+ // SGPR spill frame indices somewhere.
// FIXME: We still have implicit uses on SGPR spill instructions in case they
// need to spill to vector memory. It's likely that will not happen, but at
// this point it appears we need the setup. This part of the prolog should be
// emitted after frame indices are eliminated.
- if (MFI->hasFlatScratchInit())
- emitFlatScratchInit(ST, MF, MBB);
+ // FIXME: Remove all of the isPhysRegUsed checks
- unsigned ScratchRsrcReg
- = getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF);
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIRegisterInfo *TRI = &TII->getRegisterInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const Function &F = MF.getFunction();
- unsigned ScratchWaveOffsetReg;
- bool FPAdjusted;
- std::tie(ScratchWaveOffsetReg, FPAdjusted) =
- getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF);
+ assert(MFI->isEntryFunction());
- // We need to insert initialization of the scratch resource descriptor.
Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
-
- unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
- if (ST.isAmdHsaOrMesa(F)) {
- PreloadedPrivateBufferReg = MFI->getPreloadedReg(
- AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
- }
-
- bool OffsetRegUsed = ScratchWaveOffsetReg != AMDGPU::NoRegister &&
- MRI.isPhysRegUsed(ScratchWaveOffsetReg);
- bool ResourceRegUsed = ScratchRsrcReg != AMDGPU::NoRegister &&
- MRI.isPhysRegUsed(ScratchRsrcReg);
-
// FIXME: Hack to not crash in situations which emitted an error.
- if (PreloadedScratchWaveOffsetReg == AMDGPU::NoRegister)
+ if (!PreloadedScratchWaveOffsetReg)
return;
- // We added live-ins during argument lowering, but since they were not used
- // they were deleted. We're adding the uses now, so add them back.
- MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
- MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
-
- if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) {
- assert(ST.isAmdHsaOrMesa(F) || ST.isMesaGfxShader(F));
- MRI.addLiveIn(PreloadedPrivateBufferReg);
- MBB.addLiveIn(PreloadedPrivateBufferReg);
+ // We need to do the replacement of the private segment buffer register even
+ // if there are no stack objects. There could be stores to undef or a
+ // constant without an associated object.
+ //
+ // This will return `Register()` in cases where there are no actual
+ // uses of the SRSRC.
+ Register ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);
+
+ // Make the selected register live throughout the function.
+ if (ScratchRsrcReg) {
+ for (MachineBasicBlock &OtherBB : MF) {
+ if (&OtherBB != &MBB) {
+ OtherBB.addLiveIn(ScratchRsrcReg);
+ }
+ }
}
- // Make the register selected live throughout the function.
- for (MachineBasicBlock &OtherBB : MF) {
- if (&OtherBB == &MBB)
- continue;
-
- if (OffsetRegUsed || FPAdjusted)
- OtherBB.addLiveIn(ScratchWaveOffsetReg);
-
- if (ResourceRegUsed)
- OtherBB.addLiveIn(ScratchRsrcReg);
+ // Now that we have fixed the reserved SRSRC we need to locate the
+ // (potentially) preloaded SRSRC.
+ Register PreloadedScratchRsrcReg;
+ if (ST.isAmdHsaOrMesa(F)) {
+ PreloadedScratchRsrcReg =
+ MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
+ if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
+ // We added live-ins during argument lowering, but since they were not
+ // used they were deleted. We're adding the uses now, so add them back.
+ MRI.addLiveIn(PreloadedScratchRsrcReg);
+ MBB.addLiveIn(PreloadedScratchRsrcReg);
+ }
}
+ // Debug location must be unknown since the first debug location is used to
+ // determine the end of the prologue.
DebugLoc DL;
MachineBasicBlock::iterator I = MBB.begin();
- // If we reserved the original input registers, we don't need to copy to the
- // reserved registers.
-
- bool CopyBuffer = ResourceRegUsed &&
- PreloadedPrivateBufferReg != AMDGPU::NoRegister &&
- ST.isAmdHsaOrMesa(F) &&
- ScratchRsrcReg != PreloadedPrivateBufferReg;
-
- // This needs to be careful of the copying order to avoid overwriting one of
- // the input registers before it's been copied to it's final
- // destination. Usually the offset should be copied first.
- bool CopyBufferFirst = TRI->isSubRegisterEq(PreloadedPrivateBufferReg,
- ScratchWaveOffsetReg);
- if (CopyBuffer && CopyBufferFirst) {
- BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
- .addReg(PreloadedPrivateBufferReg, RegState::Kill);
+ // We found the SRSRC first because it needs four registers and has an
+ // alignment requirement. If the SRSRC that we found overlaps
+ // the scratch wave offset, which may be in a fixed SGPR or a free SGPR
+ // chosen by SITargetLowering::allocateSystemSGPRs, COPY the scratch
+ // wave offset to a free SGPR.
+ Register ScratchWaveOffsetReg;
+ if (TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) {
+ ArrayRef<MCPhysReg> AllSGPRs = TRI->getAllSGPR32(MF);
+ unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
+ AllSGPRs = AllSGPRs.slice(
+ std::min(static_cast<unsigned>(AllSGPRs.size()), NumPreloaded));
+ Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
+ for (MCPhysReg Reg : AllSGPRs) {
+ if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
+ !TRI->isSubRegisterEq(ScratchRsrcReg, Reg) && GITPtrLoReg != Reg) {
+ ScratchWaveOffsetReg = Reg;
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
+ .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
+ break;
+ }
+ }
+ } else {
+ ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg;
}
+ assert(ScratchWaveOffsetReg);
- unsigned SPReg = MFI->getStackPtrOffsetReg();
- assert(SPReg != AMDGPU::SP_REG);
-
- // FIXME: Remove the isPhysRegUsed checks
- const bool HasFP = hasFP(MF);
-
- if (HasFP || OffsetRegUsed) {
- assert(ScratchWaveOffsetReg);
- BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
- .addReg(PreloadedScratchWaveOffsetReg, HasFP ? RegState::Kill : 0);
+ if (MF.getFrameInfo().hasCalls()) {
+ Register SPReg = MFI->getStackPtrOffsetReg();
+ assert(SPReg != AMDGPU::SP_REG);
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg)
+ .addImm(MF.getFrameInfo().getStackSize() * ST.getWavefrontSize());
}
- if (CopyBuffer && !CopyBufferFirst) {
- BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
- .addReg(PreloadedPrivateBufferReg, RegState::Kill);
+ if (hasFP(MF)) {
+ Register FPReg = MFI->getFrameOffsetReg();
+ assert(FPReg != AMDGPU::FP_REG);
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
}
- if (ResourceRegUsed) {
- emitEntryFunctionScratchSetup(ST, MF, MBB, MFI, I,
- PreloadedPrivateBufferReg, ScratchRsrcReg);
+ if (MFI->hasFlatScratchInit() || ScratchRsrcReg) {
+ MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
+ MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
}
- if (HasFP) {
- DebugLoc DL;
- const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
- int64_t StackSize = FrameInfo.getStackSize();
+ if (MFI->hasFlatScratchInit()) {
+ emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
+ }
- // On kernel entry, the private scratch wave offset is the SP value.
- if (StackSize == 0) {
- BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), SPReg)
- .addReg(MFI->getScratchWaveOffsetReg());
- } else {
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), SPReg)
- .addReg(MFI->getScratchWaveOffsetReg())
- .addImm(StackSize * ST.getWavefrontSize());
- }
+ if (ScratchRsrcReg) {
+ emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
+ PreloadedScratchRsrcReg,
+ ScratchRsrcReg, ScratchWaveOffsetReg);
}
}
-// Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set.
-void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST,
- MachineFunction &MF, MachineBasicBlock &MBB, SIMachineFunctionInfo *MFI,
- MachineBasicBlock::iterator I, unsigned PreloadedPrivateBufferReg,
- unsigned ScratchRsrcReg) const {
+// Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`
+void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
+ MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ const DebugLoc &DL, Register PreloadedScratchRsrcReg,
+ Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo *TRI = &TII->getRegisterInfo();
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
const Function &Fn = MF.getFunction();
- DebugLoc DL;
if (ST.isAmdPalOS()) {
// The pointer to the GIT is formed from the offset passed in and either
@@ -557,19 +504,7 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST,
const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
BuildMI(MBB, I, DL, GetPC64, Rsrc01);
}
- auto GitPtrLo = AMDGPU::SGPR0; // Low GIT address passed in
- if (ST.hasMergedShaders()) {
- switch (MF.getFunction().getCallingConv()) {
- case CallingConv::AMDGPU_HS:
- case CallingConv::AMDGPU_GS:
- // Low GIT address is passed in s8 rather than s0 for an LS+HS or
- // ES+GS merged shader on gfx9+.
- GitPtrLo = AMDGPU::SGPR8;
- break;
- default:
- break;
- }
- }
+ Register GitPtrLo = MFI->getGITPtrLoReg(MF);
MF.getRegInfo().addLiveIn(GitPtrLo);
MBB.addLiveIn(GitPtrLo);
BuildMI(MBB, I, DL, SMovB32, RsrcLo)
@@ -582,12 +517,12 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST,
const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM);
auto MMO = MF.getMachineMemOperand(PtrInfo,
MachineMemOperand::MOLoad |
- MachineMemOperand::MOInvariant |
- MachineMemOperand::MODereferenceable,
- 16, 4);
+ MachineMemOperand::MOInvariant |
+ MachineMemOperand::MODereferenceable,
+ 16, Align(4));
unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
- unsigned EncodedOffset = AMDGPU::getSMRDEncodedOffset(Subtarget, Offset);
+ unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
.addReg(Rsrc01)
.addImm(EncodedOffset) // offset
@@ -595,10 +530,7 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST,
.addImm(0) // dlc
.addReg(ScratchRsrcReg, RegState::ImplicitDefine)
.addMemOperand(MMO);
- return;
- }
- if (ST.isMesaGfxShader(Fn)
- || (PreloadedPrivateBufferReg == AMDGPU::NoRegister)) {
+ } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) {
assert(!ST.isAmdHsaOrMesa(Fn));
const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
@@ -621,11 +553,11 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST,
const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
- auto MMO = MF.getMachineMemOperand(PtrInfo,
- MachineMemOperand::MOLoad |
- MachineMemOperand::MOInvariant |
- MachineMemOperand::MODereferenceable,
- 8, 4);
+ auto MMO = MF.getMachineMemOperand(
+ PtrInfo,
+ MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
+ MachineMemOperand::MODereferenceable,
+ 8, Align(4));
BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
.addReg(MFI->getImplicitBufferPtrUserSGPR())
.addImm(0) // offset
@@ -658,7 +590,37 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST,
BuildMI(MBB, I, DL, SMovB32, Rsrc3)
.addImm(Rsrc23 >> 32)
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+ } else if (ST.isAmdHsaOrMesa(Fn)) {
+ assert(PreloadedScratchRsrcReg);
+
+ if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
+ .addReg(PreloadedScratchRsrcReg, RegState::Kill);
+ }
}
+
+ // Add the scratch wave offset into the scratch RSRC.
+ //
+ // We only want to update the first 48 bits, which is the base address
+ // pointer, without touching the adjacent 16 bits of flags. We know this add
+ // cannot carry-out from bit 47, otherwise the scratch allocation would be
+ // impossible to fit in the 48-bit global address space.
+ //
+ // TODO: Evaluate if it is better to just construct an SRD using the flat
+ // scratch init and some constants rather than update the one we are passed.
+ Register ScratchRsrcSub0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
+ Register ScratchRsrcSub1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
+
+ // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in
+ // the kernel body via inreg arguments.
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), ScratchRsrcSub0)
+ .addReg(ScratchRsrcSub0)
+ .addReg(ScratchWaveOffsetReg)
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1)
+ .addReg(ScratchRsrcSub1)
+ .addImm(0)
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
}
bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
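A standalone sketch of the 48-bit base update described in the comment above (illustrative values, not real descriptor contents): the S_ADD_U32/S_ADDC_U32 pair lets a carry out of the low dword propagate into bits 32..47 of the high dword while leaving the 16 flag bits untouched, because the sum still fits in 48 bits.

#include <cassert>
#include <cstdint>

int main() {
  // Hypothetical SRD words: a 48-bit base address plus 16 bits of flags.
  const uint64_t Base = 0x00001234FFFF0000ULL; // fits in 48 bits
  const uint16_t Flags = 0xBEEF;
  uint32_t Sub0 = uint32_t(Base);
  uint32_t Sub1 = uint32_t(Base >> 32) | (uint32_t(Flags) << 16);
  const uint32_t WaveOffset = 0x00020000u;

  // s_add_u32 %sub0, %sub0, %offset ; s_addc_u32 %sub1, %sub1, 0
  uint64_t Lo = uint64_t(Sub0) + WaveOffset;
  uint32_t Carry = uint32_t(Lo >> 32);
  Sub0 = uint32_t(Lo);
  Sub1 += Carry;

  // Flags in bits 48..63 are untouched, and the 48-bit base is updated,
  // since there is no carry out of bit 47.
  assert((Sub1 >> 16) == Flags);
  assert(((uint64_t(Sub1 & 0xFFFFu) << 32) | Sub0) == Base + WaveOffset);
  return 0;
}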
@@ -673,6 +635,50 @@ bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
llvm_unreachable("Invalid TargetStackID::Value");
}
+// Activate all lanes, returns saved exec.
+static Register buildScratchExecCopy(LivePhysRegs &LiveRegs,
+ MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ bool IsProlog) {
+ Register ScratchExecCopy;
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
+ SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+ DebugLoc DL;
+
+ if (LiveRegs.empty()) {
+ if (IsProlog) {
+ LiveRegs.init(TRI);
+ LiveRegs.addLiveIns(MBB);
+ if (FuncInfo->SGPRForFPSaveRestoreCopy)
+ LiveRegs.removeReg(FuncInfo->SGPRForFPSaveRestoreCopy);
+
+ if (FuncInfo->SGPRForBPSaveRestoreCopy)
+ LiveRegs.removeReg(FuncInfo->SGPRForBPSaveRestoreCopy);
+ } else {
+ // In epilog.
+ LiveRegs.init(*ST.getRegisterInfo());
+ LiveRegs.addLiveOuts(MBB);
+ LiveRegs.stepBackward(*MBBI);
+ }
+ }
+
+ ScratchExecCopy = findScratchNonCalleeSaveRegister(
+ MRI, LiveRegs, *TRI.getWaveMaskRegClass());
+
+ if (!IsProlog)
+ LiveRegs.removeReg(ScratchExecCopy);
+
+ const unsigned OrSaveExec =
+ ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
+ BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), ScratchExecCopy).addImm(-1);
+
+ return ScratchExecCopy;
+}
+
void SIFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
@@ -687,51 +693,81 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo &TRI = TII->getRegisterInfo();
- unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
- unsigned FramePtrReg = FuncInfo->getFrameOffsetReg();
+ Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
+ Register FramePtrReg = FuncInfo->getFrameOffsetReg();
+ Register BasePtrReg =
+ TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();
LivePhysRegs LiveRegs;
MachineBasicBlock::iterator MBBI = MBB.begin();
DebugLoc DL;
bool HasFP = false;
+ bool HasBP = false;
uint32_t NumBytes = MFI.getStackSize();
uint32_t RoundedSize = NumBytes;
// To avoid clobbering VGPRs in lanes that weren't active on function entry,
// turn on all lanes before doing the spill to memory.
- unsigned ScratchExecCopy = AMDGPU::NoRegister;
+ Register ScratchExecCopy;
+
+ bool HasFPSaveIndex = FuncInfo->FramePointerSaveIndex.hasValue();
+ bool SpillFPToMemory = false;
+ // A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR.
+ // Otherwise we are spilling the FP to memory.
+ if (HasFPSaveIndex) {
+ SpillFPToMemory = MFI.getStackID(*FuncInfo->FramePointerSaveIndex) !=
+ TargetStackID::SGPRSpill;
+ }
+
+ bool HasBPSaveIndex = FuncInfo->BasePointerSaveIndex.hasValue();
+ bool SpillBPToMemory = false;
+ // A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR.
+ // Otherwise we are spilling the BP to memory.
+ if (HasBPSaveIndex) {
+ SpillBPToMemory = MFI.getStackID(*FuncInfo->BasePointerSaveIndex) !=
+ TargetStackID::SGPRSpill;
+ }
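
The two blocks above reduce the FP/BP save strategy to one question per register: does the save slot carry the SGPRSpill stack ID (the value goes into a VGPR lane) or not (the value is written out to scratch memory)? A compact sketch of that predicate, with names invented for illustration:

#include <cstdio>
#include <optional>

// Illustrative restatement of SpillFPToMemory / SpillBPToMemory: a save index
// whose stack object is *not* tagged SGPRSpill falls back to a memory spill.
enum class StackID { Default, SGPRSpill };

static bool spillsToMemory(std::optional<int> SaveIndex, StackID SlotID) {
  return SaveIndex.has_value() && SlotID != StackID::SGPRSpill;
}

int main() {
  std::printf("%d %d\n", spillsToMemory(0, StackID::Default),
              spillsToMemory(0, StackID::SGPRSpill)); // 1 0
  return 0;
}
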
// Emit the copy if we need an FP, and are using a free SGPR to save it.
- if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister) {
+ if (FuncInfo->SGPRForFPSaveRestoreCopy) {
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FuncInfo->SGPRForFPSaveRestoreCopy)
.addReg(FramePtrReg)
.setMIFlag(MachineInstr::FrameSetup);
}
+ // Emit the copy if we need a BP, and are using a free SGPR to save it.
+ if (FuncInfo->SGPRForBPSaveRestoreCopy) {
+ // 2-byte alignment is worse than 1 unless doing a 2-byte access.
+ FuncInfo->SGPRForBPSaveRestoreCopy)
+ .addReg(BasePtrReg)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ // If a copy has been emitted for FP and/or BP, make the SGPRs
+ // used in the copy instructions live throughout the function.
+ SmallVector<MCPhysReg, 2> TempSGPRs;
+ if (FuncInfo->SGPRForFPSaveRestoreCopy)
+ TempSGPRs.push_back(FuncInfo->SGPRForFPSaveRestoreCopy);
+
+ if (FuncInfo->SGPRForBPSaveRestoreCopy)
+ TempSGPRs.push_back(FuncInfo->SGPRForBPSaveRestoreCopy);
+
+ if (!TempSGPRs.empty()) {
+ for (MachineBasicBlock &MBB : MF) {
+ for (MCPhysReg Reg : TempSGPRs)
+ MBB.addLiveIn(Reg);
+
+ MBB.sortUniqueLiveIns();
+ }
+ }
+
for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
: FuncInfo->getSGPRSpillVGPRs()) {
if (!Reg.FI.hasValue())
continue;
- if (ScratchExecCopy == AMDGPU::NoRegister) {
- if (LiveRegs.empty()) {
- LiveRegs.init(TRI);
- LiveRegs.addLiveIns(MBB);
- if (FuncInfo->SGPRForFPSaveRestoreCopy)
- LiveRegs.removeReg(FuncInfo->SGPRForFPSaveRestoreCopy);
- }
-
- ScratchExecCopy
- = findScratchNonCalleeSaveRegister(MRI, LiveRegs,
- *TRI.getWaveMaskRegClass());
- assert(FuncInfo->SGPRForFPSaveRestoreCopy != ScratchExecCopy);
-
- const unsigned OrSaveExec = ST.isWave32() ?
- AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
- BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec),
- ScratchExecCopy)
- .addImm(-1);
- }
+ if (!ScratchExecCopy)
+ ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);
buildPrologSpill(LiveRegs, MBB, MBBI, TII, Reg.VGPR,
FuncInfo->getScratchRSrcReg(),
@@ -739,84 +775,153 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
Reg.FI.getValue());
}
- if (ScratchExecCopy != AMDGPU::NoRegister) {
+ if (HasFPSaveIndex && SpillFPToMemory) {
+ assert(!MFI.isDeadObjectIndex(FuncInfo->FramePointerSaveIndex.getValue()));
+
+ if (!ScratchExecCopy)
+ ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);
+
+ MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
+ MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
+
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
+ .addReg(FramePtrReg);
+
+ buildPrologSpill(LiveRegs, MBB, MBBI, TII, TmpVGPR,
+ FuncInfo->getScratchRSrcReg(), StackPtrReg,
+ FuncInfo->FramePointerSaveIndex.getValue());
+ }
+
+ if (HasBPSaveIndex && SpillBPToMemory) {
+ assert(!MFI.isDeadObjectIndex(*FuncInfo->BasePointerSaveIndex));
+
+ if (!ScratchExecCopy)
+ ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);
+
+ MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
+ MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
+
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
+ .addReg(BasePtrReg);
+
+ buildPrologSpill(LiveRegs, MBB, MBBI, TII, TmpVGPR,
+ FuncInfo->getScratchRSrcReg(), StackPtrReg,
+ *FuncInfo->BasePointerSaveIndex);
+ }
+
+ if (ScratchExecCopy) {
// FIXME: Split block and make terminator.
unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
- unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
- .addReg(ScratchExecCopy, RegState::Kill);
+ .addReg(ScratchExecCopy, RegState::Kill);
LiveRegs.addReg(ScratchExecCopy);
}
-
- if (FuncInfo->FramePointerSaveIndex) {
+ // In this case, spill the FP to a reserved VGPR.
+ if (HasFPSaveIndex && !SpillFPToMemory) {
const int FI = FuncInfo->FramePointerSaveIndex.getValue();
- assert(!MFI.isDeadObjectIndex(FI) &&
- MFI.getStackID(FI) == TargetStackID::SGPRSpill);
- ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill
- = FuncInfo->getSGPRToVGPRSpills(FI);
+ assert(!MFI.isDeadObjectIndex(FI));
+
+ assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
+ ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
+ FuncInfo->getSGPRToVGPRSpills(FI);
assert(Spill.size() == 1);
// Save FP before setting it up.
// FIXME: This should respect spillSGPRToVGPR;
BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
Spill[0].VGPR)
- .addReg(FramePtrReg)
- .addImm(Spill[0].Lane)
- .addReg(Spill[0].VGPR, RegState::Undef);
+ .addReg(FramePtrReg)
+ .addImm(Spill[0].Lane)
+ .addReg(Spill[0].VGPR, RegState::Undef);
+ }
+
+ // In this case, spill the BP to a reserved VGPR.
+ if (HasBPSaveIndex && !SpillBPToMemory) {
+ const int BasePtrFI = *FuncInfo->BasePointerSaveIndex;
+ assert(!MFI.isDeadObjectIndex(BasePtrFI));
+
+ assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill);
+ ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
+ FuncInfo->getSGPRToVGPRSpills(BasePtrFI);
+ assert(Spill.size() == 1);
+
+ // Save BP before setting it up.
+ // FIXME: This should respect spillSGPRToVGPR;
+ BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
+ Spill[0].VGPR)
+ .addReg(BasePtrReg)
+ .addImm(Spill[0].Lane)
+ .addReg(Spill[0].VGPR, RegState::Undef);
}
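
When the save slot does carry the SGPRSpill stack ID, the FP/BP value is written into a single lane of a reserved VGPR with V_WRITELANE_B32 and pulled back out in the epilogue with V_READLANE_B32. A stand-alone sketch of that lane-spill round trip (lane count and values are made up):

#include <array>
#include <cstdint>
#include <cstdio>

// Each VGPR holds one 32-bit value per lane; writelane/readlane move a scalar
// into/out of exactly one lane without touching the others.
struct VGPR { std::array<uint32_t, 64> Lane{}; };

static void writeLane(VGPR &V, unsigned Lane, uint32_t SVal) { V.Lane[Lane] = SVal; }
static uint32_t readLane(const VGPR &V, unsigned Lane) { return V.Lane[Lane]; }

int main() {
  VGPR SpillVGPR;
  const uint32_t FramePtr = 0x1f00u;
  writeLane(SpillVGPR, 0, FramePtr);                // prologue: save FP to lane 0
  const uint32_t Restored = readLane(SpillVGPR, 0); // epilogue: restore FP
  std::printf("restored FP = 0x%x\n", Restored);
  return 0;
}
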
if (TRI.needsStackRealignment(MF)) {
HasFP = true;
- const unsigned Alignment = MFI.getMaxAlignment();
+ const unsigned Alignment = MFI.getMaxAlign().value();
RoundedSize += Alignment;
if (LiveRegs.empty()) {
LiveRegs.init(TRI);
LiveRegs.addLiveIns(MBB);
LiveRegs.addReg(FuncInfo->SGPRForFPSaveRestoreCopy);
+ LiveRegs.addReg(FuncInfo->SGPRForBPSaveRestoreCopy);
}
- unsigned ScratchSPReg = findScratchNonCalleeSaveRegister(
+ Register ScratchSPReg = findScratchNonCalleeSaveRegister(
MRI, LiveRegs, AMDGPU::SReg_32_XM0RegClass);
- assert(ScratchSPReg != AMDGPU::NoRegister &&
- ScratchSPReg != FuncInfo->SGPRForFPSaveRestoreCopy);
+ assert(ScratchSPReg && ScratchSPReg != FuncInfo->SGPRForFPSaveRestoreCopy &&
+ ScratchSPReg != FuncInfo->SGPRForBPSaveRestoreCopy);
// s_add_u32 tmp_reg, s32, NumBytes
// s_and_b32 s32, tmp_reg, 0b111...0000
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), ScratchSPReg)
- .addReg(StackPtrReg)
- .addImm((Alignment - 1) * ST.getWavefrontSize())
- .setMIFlag(MachineInstr::FrameSetup);
+ .addReg(StackPtrReg)
+ .addImm((Alignment - 1) * ST.getWavefrontSize())
+ .setMIFlag(MachineInstr::FrameSetup);
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
- .addReg(ScratchSPReg, RegState::Kill)
- .addImm(-Alignment * ST.getWavefrontSize())
- .setMIFlag(MachineInstr::FrameSetup);
+ .addReg(ScratchSPReg, RegState::Kill)
+ .addImm(-Alignment * ST.getWavefrontSize())
+ .setMIFlag(MachineInstr::FrameSetup);
FuncInfo->setIsStackRealigned(true);
} else if ((HasFP = hasFP(MF))) {
- // If we need a base pointer, set it up here. It's whatever the value of
- // the stack pointer is at this point. Any variable size objects will be
- // allocated after this, so we can still use the base pointer to reference
- // locals.
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
- .addReg(StackPtrReg)
- .setMIFlag(MachineInstr::FrameSetup);
+ .addReg(StackPtrReg)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
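
The realignment sequence a few lines up rounds the wave-scaled stack pointer up to the requested alignment: add (Alignment - 1) * WavefrontSize, then AND with -(Alignment * WavefrontSize). The scaling by wavefront size is needed because the SGPR stack registers count bytes for the whole wave, not per lane. A worked example with made-up numbers:

#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t WavefrontSize = 64; // wave64 for this example
  const uint32_t Alignment = 16;     // per-lane alignment in bytes
  uint32_t SP = 0x12345;             // wave-scaled stack pointer (s32)

  // s_add_u32 tmp, s32, (Alignment - 1) * WavefrontSize
  uint32_t Tmp = SP + (Alignment - 1) * WavefrontSize;
  // s_and_b32 fp, tmp, -(Alignment * WavefrontSize)
  uint32_t FP = Tmp & -(Alignment * WavefrontSize);

  std::printf("FP = 0x%x, a multiple of 0x%x\n", FP, Alignment * WavefrontSize);
  return 0;
}
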
+
+ // If we need a base pointer, set it up here. It's whatever the value of
+ // the stack pointer is at this point. Any variable size objects will be
+ // allocated after this, so we can still use the base pointer to reference
+ // the incoming arguments.
+ if ((HasBP = TRI.hasBasePointer(MF))) {
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg)
+ .addReg(StackPtrReg)
+ .setMIFlag(MachineInstr::FrameSetup);
}
if (HasFP && RoundedSize != 0) {
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg)
- .addReg(StackPtrReg)
- .addImm(RoundedSize * ST.getWavefrontSize())
- .setMIFlag(MachineInstr::FrameSetup);
+ .addReg(StackPtrReg)
+ .addImm(RoundedSize * ST.getWavefrontSize())
+ .setMIFlag(MachineInstr::FrameSetup);
}
- assert((!HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister ||
+ assert((!HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy ||
FuncInfo->FramePointerSaveIndex)) &&
"Needed to save FP but didn't save it anywhere");
- assert((HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy == AMDGPU::NoRegister &&
+ assert((HasFP || (!FuncInfo->SGPRForFPSaveRestoreCopy &&
!FuncInfo->FramePointerSaveIndex)) &&
"Saved FP but didn't need it");
+
+ assert((!HasBP || (FuncInfo->SGPRForBPSaveRestoreCopy ||
+ FuncInfo->BasePointerSaveIndex)) &&
+ "Needed to save BP but didn't save it anywhere");
+
+ assert((HasBP || (!FuncInfo->SGPRForBPSaveRestoreCopy &&
+ !FuncInfo->BasePointerSaveIndex)) &&
+ "Saved BP but didn't need it");
}
void SIFrameLowering::emitEpilogue(MachineFunction &MF,
@@ -828,81 +933,126 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
LivePhysRegs LiveRegs;
DebugLoc DL;
const MachineFrameInfo &MFI = MF.getFrameInfo();
uint32_t NumBytes = MFI.getStackSize();
- uint32_t RoundedSize = FuncInfo->isStackRealigned() ?
- NumBytes + MFI.getMaxAlignment() : NumBytes;
+ uint32_t RoundedSize = FuncInfo->isStackRealigned()
+ ? NumBytes + MFI.getMaxAlign().value()
+ : NumBytes;
+ const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
+ const Register FramePtrReg = FuncInfo->getFrameOffsetReg();
+ const Register BasePtrReg =
+ TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();
+
+ bool HasFPSaveIndex = FuncInfo->FramePointerSaveIndex.hasValue();
+ bool SpillFPToMemory = false;
+ if (HasFPSaveIndex) {
+ SpillFPToMemory = MFI.getStackID(*FuncInfo->FramePointerSaveIndex) !=
+ TargetStackID::SGPRSpill;
+ }
+
+ bool HasBPSaveIndex = FuncInfo->BasePointerSaveIndex.hasValue();
+ bool SpillBPToMemory = false;
+ if (HasBPSaveIndex) {
+ SpillBPToMemory = MFI.getStackID(*FuncInfo->BasePointerSaveIndex) !=
+ TargetStackID::SGPRSpill;
+ }
if (RoundedSize != 0 && hasFP(MF)) {
- const unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
.addReg(StackPtrReg)
.addImm(RoundedSize * ST.getWavefrontSize())
.setMIFlag(MachineInstr::FrameDestroy);
}
- if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister) {
- BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FuncInfo->getFrameOffsetReg())
- .addReg(FuncInfo->SGPRForFPSaveRestoreCopy)
- .setMIFlag(MachineInstr::FrameSetup);
+ if (FuncInfo->SGPRForFPSaveRestoreCopy) {
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
+ .addReg(FuncInfo->SGPRForFPSaveRestoreCopy)
+ .setMIFlag(MachineInstr::FrameSetup);
}
- if (FuncInfo->FramePointerSaveIndex) {
- const int FI = FuncInfo->FramePointerSaveIndex.getValue();
+ if (FuncInfo->SGPRForBPSaveRestoreCopy) {
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg)
+ .addReg(FuncInfo->SGPRForBPSaveRestoreCopy)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
- assert(!MF.getFrameInfo().isDeadObjectIndex(FI) &&
- MF.getFrameInfo().getStackID(FI) == TargetStackID::SGPRSpill);
+ Register ScratchExecCopy;
+ if (HasFPSaveIndex) {
+ const int FI = FuncInfo->FramePointerSaveIndex.getValue();
+ assert(!MFI.isDeadObjectIndex(FI));
+ if (SpillFPToMemory) {
+ if (!ScratchExecCopy)
+ ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);
+
+ MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister(
+ MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
+ buildEpilogReload(LiveRegs, MBB, MBBI, TII, TempVGPR,
+ FuncInfo->getScratchRSrcReg(), StackPtrReg, FI);
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), FramePtrReg)
+ .addReg(TempVGPR, RegState::Kill);
+ } else {
+ // Reload from VGPR spill.
+ assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
+ ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
+ FuncInfo->getSGPRToVGPRSpills(FI);
+ assert(Spill.size() == 1);
+ BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
+ FramePtrReg)
+ .addReg(Spill[0].VGPR)
+ .addImm(Spill[0].Lane);
+ }
+ }
- ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill
- = FuncInfo->getSGPRToVGPRSpills(FI);
- assert(Spill.size() == 1);
- BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
- FuncInfo->getFrameOffsetReg())
- .addReg(Spill[0].VGPR)
- .addImm(Spill[0].Lane);
+ if (HasBPSaveIndex) {
+ const int BasePtrFI = *FuncInfo->BasePointerSaveIndex;
+ assert(!MFI.isDeadObjectIndex(BasePtrFI));
+ if (SpillBPToMemory) {
+ if (!ScratchExecCopy)
+ ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);
+
+ MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister(
+ MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
+ buildEpilogReload(LiveRegs, MBB, MBBI, TII, TempVGPR,
+ FuncInfo->getScratchRSrcReg(), StackPtrReg, BasePtrFI);
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), BasePtrReg)
+ .addReg(TempVGPR, RegState::Kill);
+ } else {
+ // Reload from VGPR spill.
+ assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill);
+ ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
+ FuncInfo->getSGPRToVGPRSpills(BasePtrFI);
+ assert(Spill.size() == 1);
+ BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
+ BasePtrReg)
+ .addReg(Spill[0].VGPR)
+ .addImm(Spill[0].Lane);
+ }
}
- unsigned ScratchExecCopy = AMDGPU::NoRegister;
- for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
- : FuncInfo->getSGPRSpillVGPRs()) {
+ for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg :
+ FuncInfo->getSGPRSpillVGPRs()) {
if (!Reg.FI.hasValue())
continue;
- const SIRegisterInfo &TRI = TII->getRegisterInfo();
- if (ScratchExecCopy == AMDGPU::NoRegister) {
- // See emitPrologue
- if (LiveRegs.empty()) {
- LiveRegs.init(*ST.getRegisterInfo());
- LiveRegs.addLiveOuts(MBB);
- LiveRegs.stepBackward(*MBBI);
- }
-
- ScratchExecCopy = findScratchNonCalleeSaveRegister(
- MRI, LiveRegs, *TRI.getWaveMaskRegClass());
- LiveRegs.removeReg(ScratchExecCopy);
-
- const unsigned OrSaveExec =
- ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
-
- BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), ScratchExecCopy)
- .addImm(-1);
- }
+ if (!ScratchExecCopy)
+ ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);
buildEpilogReload(LiveRegs, MBB, MBBI, TII, Reg.VGPR,
- FuncInfo->getScratchRSrcReg(),
- FuncInfo->getStackPtrOffsetReg(), Reg.FI.getValue());
+ FuncInfo->getScratchRSrcReg(), StackPtrReg,
+ Reg.FI.getValue());
}
- if (ScratchExecCopy != AMDGPU::NoRegister) {
+ if (ScratchExecCopy) {
// FIXME: Split block and make terminator.
unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
- unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
- .addReg(ScratchExecCopy, RegState::Kill);
+ .addReg(ScratchExecCopy, RegState::Kill);
}
}
@@ -920,12 +1070,14 @@ static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
#ifndef NDEBUG
static bool allSGPRSpillsAreDead(const MachineFrameInfo &MFI,
- Optional<int> FramePointerSaveIndex) {
+ Optional<int> FramePointerSaveIndex,
+ Optional<int> BasePointerSaveIndex) {
for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
I != E; ++I) {
if (!MFI.isDeadObjectIndex(I) &&
MFI.getStackID(I) == TargetStackID::SGPRSpill &&
- FramePointerSaveIndex && I != FramePointerSaveIndex) {
+ ((FramePointerSaveIndex && I != FramePointerSaveIndex) ||
+ (BasePointerSaveIndex && I != BasePointerSaveIndex))) {
return false;
}
}
@@ -935,7 +1087,7 @@ static bool allSGPRSpillsAreDead(const MachineFrameInfo &MFI,
#endif
int SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
- unsigned &FrameReg) const {
+ Register &FrameReg) const {
const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
FrameReg = RI->getFrameRegister(MF);
@@ -952,7 +1104,7 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
FuncInfo->removeDeadFrameIndices(MFI);
- assert(allSGPRSpillsAreDead(MFI, None) &&
+ assert(allSGPRSpillsAreDead(MFI, None, None) &&
"SGPR spill should have been removed in SILowerSGPRSpills");
// FIXME: The other checks should be redundant with allStackObjectsAreDead,
@@ -967,9 +1119,8 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
RS->addScavengingFrameIndex(ScavengeFI);
} else {
int ScavengeFI = MFI.CreateStackObject(
- TRI->getSpillSize(AMDGPU::SGPR_32RegClass),
- TRI->getSpillAlignment(AMDGPU::SGPR_32RegClass),
- false);
+ TRI->getSpillSize(AMDGPU::SGPR_32RegClass),
+ TRI->getSpillAlign(AMDGPU::SGPR_32RegClass), false);
RS->addScavengingFrameIndex(ScavengeFI);
}
}
@@ -984,7 +1135,7 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
if (MFI->isEntryFunction())
return;
- const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+ MachineFrameInfo &FrameInfo = MF.getFrameInfo();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
@@ -1008,46 +1159,19 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
for (auto SSpill : MFI->getSGPRSpillVGPRs())
SavedVGPRs.reset(SSpill.VGPR);
- const bool HasFP = WillHaveFP || hasFP(MF);
- if (!HasFP)
- return;
-
- if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) {
- int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr,
- TargetStackID::SGPRSpill);
-
- // If there is already a VGPR with free lanes, use it. We may already have
- // to pay the penalty for spilling a CSR VGPR.
- if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
- llvm_unreachable("allocate SGPR spill should have worked");
-
- MFI->FramePointerSaveIndex = NewFI;
+ LivePhysRegs LiveRegs;
+ LiveRegs.init(*TRI);
- LLVM_DEBUG(
- auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
- dbgs() << "Spilling FP to " << printReg(Spill.VGPR, TRI)
- << ':' << Spill.Lane << '\n');
- return;
+ if (WillHaveFP || hasFP(MF)) {
+ getVGPRSpillLaneOrTempRegister(MF, LiveRegs, MFI->SGPRForFPSaveRestoreCopy,
+ MFI->FramePointerSaveIndex, true);
}
- MFI->SGPRForFPSaveRestoreCopy = findUnusedSGPRNonCalleeSaved(MF.getRegInfo());
-
- if (!MFI->SGPRForFPSaveRestoreCopy) {
- // There's no free lane to spill, and no free register to save FP, so we're
- // forced to spill another VGPR to use for the spill.
- int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr,
- TargetStackID::SGPRSpill);
- if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
- llvm_unreachable("allocate SGPR spill should have worked");
- MFI->FramePointerSaveIndex = NewFI;
-
- LLVM_DEBUG(
- auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
- dbgs() << "FP requires fallback spill to " << printReg(Spill.VGPR, TRI)
- << ':' << Spill.Lane << '\n';);
- } else {
- LLVM_DEBUG(dbgs() << "Saving FP with copy to " <<
- printReg(MFI->SGPRForFPSaveRestoreCopy, TRI) << '\n');
+ if (TRI->hasBasePointer(MF)) {
+ if (MFI->SGPRForFPSaveRestoreCopy)
+ LiveRegs.addReg(MFI->SGPRForFPSaveRestoreCopy);
+ getVGPRSpillLaneOrTempRegister(MF, LiveRegs, MFI->SGPRForBPSaveRestoreCopy,
+ MFI->BasePointerSaveIndex, false);
}
}
@@ -1074,14 +1198,31 @@ bool SIFrameLowering::assignCalleeSavedSpillSlots(
return true; // Early exit if no callee saved registers are modified!
const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
- if (!FuncInfo->SGPRForFPSaveRestoreCopy)
+ if (!FuncInfo->SGPRForFPSaveRestoreCopy &&
+ !FuncInfo->SGPRForBPSaveRestoreCopy)
return false;
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo *RI = ST.getRegisterInfo();
+ Register FramePtrReg = FuncInfo->getFrameOffsetReg();
+ Register BasePtrReg = RI->getBaseRegister();
+ unsigned NumModifiedRegs = 0;
+
+ if (FuncInfo->SGPRForFPSaveRestoreCopy)
+ NumModifiedRegs++;
+ if (FuncInfo->SGPRForBPSaveRestoreCopy)
+ NumModifiedRegs++;
+
for (auto &CS : CSI) {
- if (CS.getReg() == FuncInfo->getFrameOffsetReg()) {
- if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister)
- CS.setDstReg(FuncInfo->SGPRForFPSaveRestoreCopy);
- break;
+ if (CS.getReg() == FramePtrReg && FuncInfo->SGPRForFPSaveRestoreCopy) {
+ CS.setDstReg(FuncInfo->SGPRForFPSaveRestoreCopy);
+ if (--NumModifiedRegs)
+ break;
+ } else if (CS.getReg() == BasePtrReg &&
+ FuncInfo->SGPRForBPSaveRestoreCopy) {
+ CS.setDstReg(FuncInfo->SGPRForBPSaveRestoreCopy);
+ if (--NumModifiedRegs)
+ break;
}
}
@@ -1104,12 +1245,10 @@ MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
if (!hasReservedCallFrame(MF)) {
- unsigned Align = getStackAlignment();
-
- Amount = alignTo(Amount, Align);
+ Amount = alignTo(Amount, getStackAlign());
assert(isUInt<32>(Amount) && "exceeded stack address space size");
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- unsigned SPReg = MFI->getStackPtrOffsetReg();
+ Register SPReg = MFI->getStackPtrOffsetReg();
unsigned Op = IsDestroy ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
BuildMI(MBB, I, DL, TII->get(Op), SPReg)
@@ -1124,19 +1263,17 @@ MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
- if (MFI.hasCalls()) {
+
+ // For entry functions we can use an immediate offset in most cases, so the
+ // presence of calls doesn't imply we need a distinct frame pointer.
+ if (MFI.hasCalls() &&
+ !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
// All offsets are unsigned, so need to be addressed in the same direction
// as stack growth.
// FIXME: This function is pretty broken, since it can be called before the
// frame layout is determined or CSR spills are inserted.
- if (MFI.getStackSize() != 0)
- return true;
-
- // For the entry point, the input wave scratch offset must be copied to the
- // API SP if there are calls.
- if (MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction())
- return true;
+ return MFI.getStackSize() != 0;
}
return MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
index d9970fd6b4b87..e894320406610 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -21,7 +21,7 @@ class GCNSubtarget;
class SIFrameLowering final : public AMDGPUFrameLowering {
public:
SIFrameLowering(StackDirection D, Align StackAl, int LAO,
- Align TransAl = Align::None())
+ Align TransAl = Align(1))
: AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {}
~SIFrameLowering() override = default;
@@ -32,7 +32,7 @@ public:
void emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const override;
int getFrameIndexReference(const MachineFunction &MF, int FI,
- unsigned &FrameReg) const override;
+ Register &FrameReg) const override;
void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
RegScavenger *RS = nullptr) const override;
@@ -55,26 +55,19 @@ public:
MachineBasicBlock::iterator MI) const override;
private:
- void emitFlatScratchInit(const GCNSubtarget &ST,
- MachineFunction &MF,
- MachineBasicBlock &MBB) const;
-
- unsigned getReservedPrivateSegmentBufferReg(
- const GCNSubtarget &ST,
- const SIInstrInfo *TII,
- const SIRegisterInfo *TRI,
- SIMachineFunctionInfo *MFI,
- MachineFunction &MF) const;
-
- std::pair<unsigned, bool> getReservedPrivateSegmentWaveByteOffsetReg(
- const GCNSubtarget &ST, const SIInstrInfo *TII, const SIRegisterInfo *TRI,
- SIMachineFunctionInfo *MFI, MachineFunction &MF) const;
-
- // Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set.
- void emitEntryFunctionScratchSetup(const GCNSubtarget &ST, MachineFunction &MF,
- MachineBasicBlock &MBB, SIMachineFunctionInfo *MFI,
- MachineBasicBlock::iterator I, unsigned PreloadedPrivateBufferReg,
- unsigned ScratchRsrcReg) const;
+ void emitEntryFunctionFlatScratchInit(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL,
+ Register ScratchWaveOffsetReg) const;
+
+ Register getEntryFunctionReservedScratchRsrcReg(MachineFunction &MF) const;
+
+ void emitEntryFunctionScratchRsrcRegSetup(
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, const DebugLoc &DL,
+ Register PreloadedPrivateBufferReg, Register ScratchRsrcReg,
+ Register ScratchWaveOffsetReg) const;
public:
bool hasFP(const MachineFunction &MF) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index e73d87cd66afa..d035aa8f72bd7 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -11,11 +11,6 @@
//
//===----------------------------------------------------------------------===//
-#if defined(_MSC_VER) || defined(__MINGW32__)
-// Provide M_PI.
-#define _USE_MATH_DEFINES
-#endif
-
#include "SIISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
@@ -40,6 +35,7 @@
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/DAGCombine.h"
#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -95,14 +91,24 @@ static cl::opt<bool> DisableLoopAlignment(
cl::desc("Do not align and prefetch loops"),
cl::init(false));
+static cl::opt<bool> VGPRReserveforSGPRSpill(
+ "amdgpu-reserve-vgpr-for-sgpr-spill",
+ cl::desc("Allocates one VGPR for future SGPR Spill"), cl::init(true));
+
+static cl::opt<bool> UseDivergentRegisterIndexing(
+ "amdgpu-use-divergent-register-indexing",
+ cl::Hidden,
+ cl::desc("Use indirect register addressing for divergent indexes"),
+ cl::init(false));
+
static bool hasFP32Denormals(const MachineFunction &MF) {
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- return Info->getMode().FP32Denormals;
+ return Info->getMode().allFP32Denormals();
}
static bool hasFP64FP16Denormals(const MachineFunction &MF) {
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- return Info->getMode().FP64FP16Denormals;
+ return Info->getMode().allFP64FP16Denormals();
}
static unsigned findFirstFreeSGPR(CCState &CCInfo) {
@@ -141,12 +147,21 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
addRegisterClass(MVT::v5f32, &AMDGPU::VReg_160RegClass);
- addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
+ addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
- addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
+ addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
+ addRegisterClass(MVT::v4f64, &AMDGPU::VReg_256RegClass);
+
+ addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
+ addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
+ addRegisterClass(MVT::v8f64, &AMDGPU::VReg_512RegClass);
+
+ addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
+ addRegisterClass(MVT::v16f64, &AMDGPU::VReg_1024RegClass);
+
if (Subtarget->has16BitInsts()) {
addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
@@ -158,10 +173,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
}
- if (Subtarget->hasMAIInsts()) {
- addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
- addRegisterClass(MVT::v32f32, &AMDGPU::VReg_1024RegClass);
- }
+ addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
+ addRegisterClass(MVT::v32f32, &AMDGPU::VReg_1024RegClass);
computeRegisterProperties(Subtarget->getRegisterInfo());
@@ -202,6 +215,17 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
+ setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
+ setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
+ setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
+ setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
+ setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
+
+ setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
+ setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
+ setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
+ setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
+ setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
@@ -224,6 +248,12 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
+ setOperationAction(ISD::TRUNCATE, MVT::v4i32, Expand);
+ setOperationAction(ISD::FP_ROUND, MVT::v4f32, Expand);
+ setOperationAction(ISD::TRUNCATE, MVT::v8i32, Expand);
+ setOperationAction(ISD::FP_ROUND, MVT::v8f32, Expand);
+ setOperationAction(ISD::TRUNCATE, MVT::v16i32, Expand);
+ setOperationAction(ISD::FP_ROUND, MVT::v16f32, Expand);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
@@ -260,7 +290,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// with > 4 elements.
for (MVT VT : { MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16,
- MVT::v32i32, MVT::v32f32 }) {
+ MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
+ MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32 }) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
switch (Op) {
case ISD::LOAD:
@@ -304,6 +335,48 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
}
+ for (MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) {
+ setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
+ AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
+
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
+ AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
+
+ setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
+ AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
+
+ setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
+ AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
+ }
+
+ for (MVT Vec64 : { MVT::v8i64, MVT::v8f64 }) {
+ setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
+ AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
+
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
+ AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
+
+ setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
+ AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
+
+ setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
+ AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
+ }
+
+ for (MVT Vec64 : { MVT::v16i64, MVT::v16f64 }) {
+ setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
+ AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
+
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
+ AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
+
+ setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
+ AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
+
+ setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
+ AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
+ }
+
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
@@ -361,9 +434,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
}
- setOperationAction(ISD::BSWAP, MVT::i32, Legal);
setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
+ // FIXME: This should be narrowed to i32, but that only happens if i64 is
+ // illegal.
+ // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
+ setOperationAction(ISD::BSWAP, MVT::i64, Legal);
+ setOperationAction(ISD::BSWAP, MVT::i32, Legal);
+
// On SI this is s_memtime and s_memrealtime on VI.
setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
setOperationAction(ISD::TRAP, MVT::Other, Custom);
@@ -376,10 +454,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FLOG10, MVT::f16, Custom);
}
- // v_mad_f32 does not support denormals. We report it as unconditionally
- // legal, and the context where it is formed will disallow it when fp32
- // denormals are enabled.
- setOperationAction(ISD::FMAD, MVT::f32, Legal);
+ if (Subtarget->hasMadMacF32Insts())
+ setOperationAction(ISD::FMAD, MVT::f32, Legal);
if (!Subtarget->hasBFI()) {
// fcopysign can be done in a single instruction with BFI.
@@ -463,7 +539,6 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SREM, MVT::i16, Promote);
setOperationAction(ISD::UREM, MVT::i16, Promote);
- setOperationAction(ISD::BSWAP, MVT::i16, Promote);
setOperationAction(ISD::BITREVERSE, MVT::i16, Promote);
setOperationAction(ISD::CTTZ, MVT::i16, Promote);
@@ -499,8 +574,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// F16 - VOP1 Actions.
setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
- setOperationAction(ISD::FCOS, MVT::f16, Promote);
- setOperationAction(ISD::FSIN, MVT::f16, Promote);
+ setOperationAction(ISD::FCOS, MVT::f16, Custom);
+ setOperationAction(ISD::FSIN, MVT::f16, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i16, Custom);
@@ -545,6 +620,11 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
}
}
+ // v_perm_b32 can handle either of these.
+ setOperationAction(ISD::BSWAP, MVT::i16, Legal);
+ setOperationAction(ISD::BSWAP, MVT::v2i16, Legal);
+ setOperationAction(ISD::BSWAP, MVT::v4i16, Custom);
+
// XXX - Do these do anything? Vector constants turn into build_vector.
setOperationAction(ISD::Constant, MVT::v2i16, Legal);
setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal);
@@ -686,6 +766,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SELECT, VT, Custom);
}
+ setOperationAction(ISD::SMULO, MVT::i64, Custom);
+ setOperationAction(ISD::UMULO, MVT::i64, Custom);
+
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
@@ -762,6 +845,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
setTargetDAGCombine(ISD::ATOMIC_LOAD_FADD);
+ // FIXME: In other contexts we pretend this is a per-function property.
+ setStackPointerRegisterToSaveRestore(AMDGPU::SGPR32);
+
setSchedulingPreference(Sched::RegPressure);
}
@@ -783,6 +869,7 @@ bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
(Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
DestVT.getScalarType() == MVT::f32 &&
SrcVT.getScalarType() == MVT::f16 &&
+ // TODO: This probably only requires no input flushing?
!hasFP32Denormals(DAG.getMachineFunction());
}
@@ -877,45 +964,33 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
}
-static MVT memVTFromAggregate(Type *Ty) {
- // Only limited forms of aggregate type currently expected.
- assert(Ty->isStructTy() && "Expected struct type");
-
+static EVT memVTFromImageData(Type *Ty, unsigned DMaskLanes) {
+ assert(DMaskLanes != 0);
- Type *ElementType = nullptr;
- unsigned NumElts;
- if (Ty->getContainedType(0)->isVectorTy()) {
- VectorType *VecComponent = cast<VectorType>(Ty->getContainedType(0));
- ElementType = VecComponent->getElementType();
- NumElts = VecComponent->getNumElements();
- } else {
- ElementType = Ty->getContainedType(0);
- NumElts = 1;
+ if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
+ unsigned NumElts = std::min(DMaskLanes, VT->getNumElements());
+ return EVT::getVectorVT(Ty->getContext(),
+ EVT::getEVT(VT->getElementType()),
+ NumElts);
}
- assert((Ty->getContainedType(1) && Ty->getContainedType(1)->isIntegerTy(32)) && "Expected int32 type");
+ return EVT::getEVT(Ty);
+}
- // Calculate the size of the memVT type from the aggregate
- unsigned Pow2Elts = 0;
- unsigned ElementSize;
- switch (ElementType->getTypeID()) {
- default:
- llvm_unreachable("Unknown type!");
- case Type::IntegerTyID:
- ElementSize = cast<IntegerType>(ElementType)->getBitWidth();
- break;
- case Type::HalfTyID:
- ElementSize = 16;
- break;
- case Type::FloatTyID:
- ElementSize = 32;
- break;
- }
- unsigned AdditionalElts = ElementSize == 16 ? 2 : 1;
- Pow2Elts = 1 << Log2_32_Ceil(NumElts + AdditionalElts);
+// Peek through TFE struct returns to only use the data size.
+static EVT memVTFromImageReturn(Type *Ty, unsigned DMaskLanes) {
+ auto *ST = dyn_cast<StructType>(Ty);
+ if (!ST)
+ return memVTFromImageData(Ty, DMaskLanes);
- return MVT::getVectorVT(MVT::getVT(ElementType, false),
- Pow2Elts);
+ // Some intrinsics return an aggregate type - special case to work out the
+ // correct memVT.
+ //
+ // Only limited forms of aggregate type currently expected.
+ if (ST->getNumContainedTypes() != 2 ||
+ !ST->getContainedType(1)->isIntegerTy(32))
+ return EVT();
+ return memVTFromImageData(ST->getContainedType(0), DMaskLanes);
}
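
memVTFromImageData/memVTFromImageReturn replace the old aggregate-type special case: the memory type of an image load is now the IR result type clamped to the number of enabled dmask lanes, with the TFE struct wrapper peeled off so only the data member counts. The callers below derive DMaskLanes as the population count of the dmask operand, treating dmask == 0 as one element. A small C++ sketch of that counting and clamping, outside of any LLVM types:

#include <algorithm>
#include <cstdio>

// Hand-rolled popcount so the sketch stays dependency-free; the pass itself
// uses countPopulation() for the same purpose.
static unsigned popCount(unsigned X) {
  unsigned N = 0;
  for (; X; X &= X - 1)
    ++N;
  return N;
}

// Number of elements the image load really returns to memory.
static unsigned memLanes(unsigned DMask, unsigned IRNumElts) {
  unsigned DMaskLanes = DMask == 0 ? 1 : popCount(DMask);
  return std::min(DMaskLanes, IRNumElts);
}

int main() {
  std::printf("%u\n", memLanes(0b0101u, 4)); // dmask enables 2 of 4 elements
  std::printf("%u\n", memLanes(0u, 4));      // dmask 0 still loads one element
  return 0;
}
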
bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
@@ -944,17 +1019,40 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags = MachineMemOperand::MODereferenceable;
if (Attr.hasFnAttribute(Attribute::ReadOnly)) {
+ unsigned DMaskLanes = 4;
+
+ if (RsrcIntr->IsImage) {
+ const AMDGPU::ImageDimIntrinsicInfo *Intr
+ = AMDGPU::getImageDimIntrinsicInfo(IntrID);
+ const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
+ AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
+
+ if (!BaseOpcode->Gather4) {
+ // If this isn't a gather, we may have excess loaded elements in the
+ // IR type. Check the dmask for the real number of elements loaded.
+ unsigned DMask
+ = cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
+ DMaskLanes = DMask == 0 ? 1 : countPopulation(DMask);
+ }
+
+ Info.memVT = memVTFromImageReturn(CI.getType(), DMaskLanes);
+ } else
+ Info.memVT = EVT::getEVT(CI.getType());
+
+ // FIXME: What does alignment mean for an image?
Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = MVT::getVT(CI.getType(), true);
- if (Info.memVT == MVT::Other) {
- // Some intrinsics return an aggregate type - special case to work out
- // the correct memVT
- Info.memVT = memVTFromAggregate(CI.getType());
- }
Info.flags |= MachineMemOperand::MOLoad;
} else if (Attr.hasFnAttribute(Attribute::WriteOnly)) {
Info.opc = ISD::INTRINSIC_VOID;
- Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
+
+ Type *DataTy = CI.getArgOperand(0)->getType();
+ if (RsrcIntr->IsImage) {
+ unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
+ unsigned DMaskLanes = DMask == 0 ? 1 : countPopulation(DMask);
+ Info.memVT = memVTFromImageData(DataTy, DMaskLanes);
+ } else
+ Info.memVT = EVT::getEVT(DataTy);
+
Info.flags |= MachineMemOperand::MOStore;
} else {
// Atomic
@@ -1031,6 +1129,17 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return true;
}
+ case Intrinsic::amdgcn_global_atomic_csub: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::getVT(CI.getType());
+ Info.ptrVal = CI.getOperand(0);
+ Info.align.reset();
+ Info.flags = MachineMemOperand::MOLoad |
+ MachineMemOperand::MOStore |
+ MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOVolatile;
+ return true;
+ }
case Intrinsic::amdgcn_ds_gws_init:
case Intrinsic::amdgcn_ds_gws_barrier:
case Intrinsic::amdgcn_ds_gws_sema_v:
@@ -1226,9 +1335,10 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
// addressing modes, so treat them as having no offset like flat
// instructions.
return isLegalFlatAddressingMode(AM);
- } else {
- llvm_unreachable("unhandled address space");
}
+
+ // Assume a user alias of global for unknown address spaces.
+ return isLegalGlobalAddressingMode(AM);
}
bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
@@ -1279,9 +1389,11 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
// If we have an uniform constant load, it still requires using a slow
// buffer instruction if unaligned.
if (IsFast) {
+ // Accesses can really be issued as 1-byte aligned or 4-byte aligned, so
+ // 2-byte alignment is worse than 1 unless doing a 2-byte accesss.
*IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ?
- (Align % 4 == 0) : true;
+ Align >= 4 : Align != 2;
}
return true;
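
The updated IsFast computation encodes the comment directly: for constant-address loads only 4-byte (or larger) alignment is fast, while for other address spaces anything except 2-byte alignment is reported fast, since the hardware issues the access as 1- or 4-byte aligned anyway. Restated as a tiny predicate (names invented here, not code from the pass):

#include <cstdio>

static bool isFastMisaligned(bool IsConstantAddrSpace, unsigned Align) {
  // Mirrors the expression above: constant address space -> Align >= 4,
  // otherwise any alignment other than 2 counts as fast.
  return IsConstantAddrSpace ? Align >= 4 : Align != 2;
}

int main() {
  std::printf("%d %d %d\n", isFastMisaligned(true, 2),
              isFastMisaligned(false, 1), isFastMisaligned(false, 2)); // 0 1 0
  return 0;
}
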
@@ -1320,18 +1432,17 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(
}
EVT SITargetLowering::getOptimalMemOpType(
- uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
- bool ZeroMemset, bool MemcpyStrSrc,
- const AttributeList &FuncAttributes) const {
+ const MemOp &Op, const AttributeList &FuncAttributes) const {
// FIXME: Should account for address space here.
// The default fallback uses the private pointer size as a guess for a type to
// use. Make sure we switch these to 64-bit accesses.
- if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global
+ if (Op.size() >= 16 &&
+ Op.isDstAligned(Align(4))) // XXX: Should only do for global
return MVT::v4i32;
- if (Size >= 8 && DstAlign >= 4)
+ if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
return MVT::v2i32;
// Use the default.
@@ -1416,9 +1527,10 @@ SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
const ArgDescriptor *InputPtrReg;
const TargetRegisterClass *RC;
+ LLT ArgTy;
- std::tie(InputPtrReg, RC)
- = Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
+ std::tie(InputPtrReg, RC, ArgTy) =
+ Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
@@ -1457,7 +1569,7 @@ SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
}
if (MemVT.isFloatingPoint())
- Val = getFPExtOrFPTrunc(DAG, Val, SL, VT);
+ Val = getFPExtOrFPRound(DAG, Val, SL, VT);
else if (Signed)
Val = DAG.getSExtOrTrunc(Val, SL, VT);
else
@@ -1467,16 +1579,15 @@ SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
}
SDValue SITargetLowering::lowerKernargMemParameter(
- SelectionDAG &DAG, EVT VT, EVT MemVT,
- const SDLoc &SL, SDValue Chain,
- uint64_t Offset, unsigned Align, bool Signed,
- const ISD::InputArg *Arg) const {
+ SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
+ uint64_t Offset, Align Alignment, bool Signed,
+ const ISD::InputArg *Arg) const {
MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
// Try to avoid using an extload by loading earlier than the argument address,
// and extracting the relevant bits. The load should hopefully be merged with
// the previous argument.
- if (MemVT.getStoreSize() < 4 && Align < 4) {
+ if (MemVT.getStoreSize() < 4 && Alignment < 4) {
// TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
int64_t AlignDownOffset = alignDown(Offset, 4);
int64_t OffsetDiff = Offset - AlignDownOffset;
@@ -1502,9 +1613,9 @@ SDValue SITargetLowering::lowerKernargMemParameter(
}
SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
- SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
+ SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOInvariant);
+ MachineMemOperand::MOInvariant);
SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
@@ -1565,8 +1676,9 @@ SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
const ArgDescriptor *Reg;
const TargetRegisterClass *RC;
+ LLT Ty;
- std::tie(Reg, RC) = MFI.getPreloadedValue(PVID);
+ std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
}
@@ -1666,7 +1778,7 @@ static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
if (RegIdx == ArgVGPRs.size()) {
// Spill to stack required.
- int64_t Offset = CCInfo.AllocateStack(4, 4);
+ int64_t Offset = CCInfo.AllocateStack(4, Align(4));
return ArgDescriptor::createStack(Offset, Mask);
}
@@ -1706,10 +1818,11 @@ static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) {
return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
}
-void SITargetLowering::allocateSpecialInputVGPRs(CCState &CCInfo,
- MachineFunction &MF,
- const SIRegisterInfo &TRI,
- SIMachineFunctionInfo &Info) const {
+/// Allocate implicit function VGPR arguments at the end of allocated user
+/// arguments.
+void SITargetLowering::allocateSpecialInputVGPRs(
+ CCState &CCInfo, MachineFunction &MF,
+ const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
const unsigned Mask = 0x3ff;
ArgDescriptor Arg;
@@ -1727,6 +1840,20 @@ void SITargetLowering::allocateSpecialInputVGPRs(CCState &CCInfo,
Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
}
+/// Allocate implicit function VGPR arguments in fixed registers.
+void SITargetLowering::allocateSpecialInputVGPRsFixed(
+ CCState &CCInfo, MachineFunction &MF,
+ const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
+ Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
+ if (!Reg)
+ report_fatal_error("failed to allocated VGPR for implicit arguments");
+
+ const unsigned Mask = 0x3ff;
+ Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
+ Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
+ Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
+}
+
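
Under the fixed function ABI the three workitem IDs no longer get their own VGPRs: allocateSpecialInputVGPRsFixed reserves VGPR31 and describes X, Y and Z as 10-bit fields at bit offsets 0, 10 and 20 (mask 0x3ff each), and passSpecialInputs later rebuilds the same layout with SHL/OR when it repacks unmasked incoming IDs for a call. A small sketch of the packing and unpacking, independent of the ArgDescriptor machinery:

#include <cstdint>
#include <cstdio>

// Three 10-bit workitem IDs share one 32-bit VGPR: X at bit 0, Y at bit 10,
// Z at bit 20, each masked with 0x3ff as in the code above.
static uint32_t packWorkItemIDs(uint32_t X, uint32_t Y, uint32_t Z) {
  return (X & 0x3ffu) | ((Y & 0x3ffu) << 10) | ((Z & 0x3ffu) << 20);
}

static uint32_t unpackWorkItemID(uint32_t Packed, unsigned Shift) {
  return (Packed >> Shift) & 0x3ffu;
}

int main() {
  uint32_t V31 = packWorkItemIDs(5, 7, 9);
  std::printf("x=%u y=%u z=%u\n", unpackWorkItemID(V31, 0),
              unpackWorkItemID(V31, 10), unpackWorkItemID(V31, 20));
  return 0;
}
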
void SITargetLowering::allocateSpecialInputSGPRs(
CCState &CCInfo,
MachineFunction &MF,
@@ -1742,8 +1869,10 @@ void SITargetLowering::allocateSpecialInputSGPRs(
if (Info.hasQueuePtr())
ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo);
- if (Info.hasKernargSegmentPtr())
- ArgInfo.KernargSegmentPtr = allocateSGPR64Input(CCInfo);
+ // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
+ // constant offset from the kernarg segment.
+ if (Info.hasImplicitArgPtr())
+ ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo);
if (Info.hasDispatchID())
ArgInfo.DispatchID = allocateSGPR64Input(CCInfo);
@@ -1758,9 +1887,6 @@ void SITargetLowering::allocateSpecialInputSGPRs(
if (Info.hasWorkGroupIDZ())
ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo);
-
- if (Info.hasImplicitArgPtr())
- ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo);
}
// Allocate special inputs passed in user SGPRs.
@@ -1916,67 +2042,45 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
Info.setScratchRSrcReg(ReservedBufferReg);
}
- // hasFP should be accurate for kernels even before the frame is finalized.
- if (ST.getFrameLowering()->hasFP(MF)) {
- MachineRegisterInfo &MRI = MF.getRegInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
- // Try to use s32 as the SP, but move it if it would interfere with input
- // arguments. This won't work with calls though.
- //
- // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
- // registers.
- if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
- Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
- } else {
- assert(AMDGPU::isShader(MF.getFunction().getCallingConv()));
+ // For entry functions we have to set up the stack pointer if we use it,
+ // whereas non-entry functions get this "for free". This means there is no
+ // intrinsic advantage to using S32 over S34 in cases where we do not have
+ // calls but do need a frame pointer (i.e. if we are requested to have one
+ // because frame pointer elimination is disabled). To keep things simple we
+ // only ever use S32 as the call ABI stack pointer, and so using it does not
+ // imply we need a separate frame pointer.
+ //
+ // Try to use s32 as the SP, but move it if it would interfere with input
+ // arguments. This won't work with calls though.
+ //
+ // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
+ // registers.
+ if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
+ Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
+ } else {
+ assert(AMDGPU::isShader(MF.getFunction().getCallingConv()));
- if (MFI.hasCalls())
- report_fatal_error("call in graphics shader with too many input SGPRs");
+ if (MFI.hasCalls())
+ report_fatal_error("call in graphics shader with too many input SGPRs");
- for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
- if (!MRI.isLiveIn(Reg)) {
- Info.setStackPtrOffsetReg(Reg);
- break;
- }
+ for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
+ if (!MRI.isLiveIn(Reg)) {
+ Info.setStackPtrOffsetReg(Reg);
+ break;
}
-
- if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
- report_fatal_error("failed to find register for SP");
}
- if (MFI.hasCalls()) {
- Info.setScratchWaveOffsetReg(AMDGPU::SGPR33);
- Info.setFrameOffsetReg(AMDGPU::SGPR33);
- } else {
- unsigned ReservedOffsetReg =
- TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
- Info.setScratchWaveOffsetReg(ReservedOffsetReg);
- Info.setFrameOffsetReg(ReservedOffsetReg);
- }
- } else if (RequiresStackAccess) {
- assert(!MFI.hasCalls());
- // We know there are accesses and they will be done relative to SP, so just
- // pin it to the input.
- //
- // FIXME: Should not do this if inline asm is reading/writing these
- // registers.
- Register PreloadedSP = Info.getPreloadedReg(
- AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
-
- Info.setStackPtrOffsetReg(PreloadedSP);
- Info.setScratchWaveOffsetReg(PreloadedSP);
- Info.setFrameOffsetReg(PreloadedSP);
- } else {
- assert(!MFI.hasCalls());
+ if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
+ report_fatal_error("failed to find register for SP");
+ }
- // There may not be stack access at all. There may still be spills, or
- // access of a constant pointer (in which cases an extra copy will be
- // emitted in the prolog).
- unsigned ReservedOffsetReg
- = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
- Info.setStackPtrOffsetReg(ReservedOffsetReg);
- Info.setScratchWaveOffsetReg(ReservedOffsetReg);
- Info.setFrameOffsetReg(ReservedOffsetReg);
+ // hasFP should be accurate for entry functions even before the frame is
+ // finalized, because it does not rely on the known stack size, only
+ // properties like whether variable sized objects are present.
+ if (ST.getFrameLowering()->hasFP(MF)) {
+ Info.setFrameOffsetReg(AMDGPU::SGPR33);
}
}
@@ -2110,6 +2214,10 @@ SDValue SITargetLowering::LowerFormalArguments(
if (IsEntryFunc) {
allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
+ } else {
+ // For the fixed ABI, pass workitem IDs in the last argument register.
+ if (AMDGPUTargetMachine::EnableFixedFunctionABI)
+ allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
}
if (IsKernel) {
@@ -2126,9 +2234,9 @@ SDValue SITargetLowering::LowerFormalArguments(
//
// FIXME: Alignment of explicit arguments totally broken with non-0 explicit
// kern arg offset.
- const unsigned KernelArgBaseAlign = 16;
+ const Align KernelArgBaseAlign = Align(16);
- for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
+ for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
const ISD::InputArg &Arg = Ins[i];
if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
InVals.push_back(DAG.getUNDEF(Arg.VT));
@@ -2143,10 +2251,11 @@ SDValue SITargetLowering::LowerFormalArguments(
EVT MemVT = VA.getLocVT();
const uint64_t Offset = VA.getLocMemOffset();
- unsigned Align = MinAlign(KernelArgBaseAlign, Offset);
+ Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
- SDValue Arg = lowerKernargMemParameter(
- DAG, VT, MemVT, DL, Chain, Offset, Align, Ins[i].Flags.isSExt(), &Ins[i]);
+ SDValue Arg =
+ lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset, Alignment,
+ Ins[i].Flags.isSExt(), &Ins[i]);
Chains.push_back(Arg.getValue(1));
auto *ParamTy =
@@ -2221,7 +2330,7 @@ SDValue SITargetLowering::LowerFormalArguments(
InVals.push_back(Val);
}
- if (!IsEntryFunc) {
+ if (!IsEntryFunc && !AMDGPUTargetMachine::EnableFixedFunctionABI) {
// Special inputs come after user arguments.
allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
}
@@ -2231,8 +2340,6 @@ SDValue SITargetLowering::LowerFormalArguments(
allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
} else {
CCInfo.AllocateReg(Info->getScratchRSrcReg());
- CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
- CCInfo.AllocateReg(Info->getFrameOffsetReg());
allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
}
@@ -2442,50 +2549,51 @@ void SITargetLowering::passSpecialInputs(
SDValue Chain) const {
// If we don't have a call site, this was a call inserted by
// legalization. These can never use special inputs.
- if (!CLI.CS)
+ if (!CLI.CB)
return;
- const Function *CalleeFunc = CLI.CS.getCalledFunction();
- assert(CalleeFunc);
-
SelectionDAG &DAG = CLI.DAG;
const SDLoc &DL = CLI.DL;
const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
-
- auto &ArgUsageInfo =
- DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
- const AMDGPUFunctionArgInfo &CalleeArgInfo
- = ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
-
const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
+ const AMDGPUFunctionArgInfo *CalleeArgInfo
+ = &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
+ if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
+ auto &ArgUsageInfo =
+ DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
+ CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
+ }
+
// TODO: Unify with private memory register handling. This is complicated by
// the fact that at least in kernels, the input argument is not necessarily
// in the same location as the input.
AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
AMDGPUFunctionArgInfo::DISPATCH_PTR,
AMDGPUFunctionArgInfo::QUEUE_PTR,
- AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR,
+ AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR,
AMDGPUFunctionArgInfo::DISPATCH_ID,
AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
- AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
- AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
+ AMDGPUFunctionArgInfo::WORKGROUP_ID_Z
};
for (auto InputID : InputRegs) {
const ArgDescriptor *OutgoingArg;
const TargetRegisterClass *ArgRC;
+ LLT ArgTy;
- std::tie(OutgoingArg, ArgRC) = CalleeArgInfo.getPreloadedValue(InputID);
+ std::tie(OutgoingArg, ArgRC, ArgTy) =
+ CalleeArgInfo->getPreloadedValue(InputID);
if (!OutgoingArg)
continue;
const ArgDescriptor *IncomingArg;
const TargetRegisterClass *IncomingArgRC;
- std::tie(IncomingArg, IncomingArgRC)
- = CallerArgInfo.getPreloadedValue(InputID);
+ LLT Ty;
+ std::tie(IncomingArg, IncomingArgRC, Ty) =
+ CallerArgInfo.getPreloadedValue(InputID);
assert(IncomingArgRC == ArgRC);
// All special arguments are ints for now.
@@ -2503,8 +2611,11 @@ void SITargetLowering::passSpecialInputs(
if (OutgoingArg->isRegister()) {
RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
+ if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
+ report_fatal_error("failed to allocate implicit input argument");
} else {
- unsigned SpecialArgOffset = CCInfo.AllocateStack(ArgVT.getStoreSize(), 4);
+ unsigned SpecialArgOffset =
+ CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
SpecialArgOffset);
MemOpChains.push_back(ArgStore);
@@ -2515,33 +2626,34 @@ void SITargetLowering::passSpecialInputs(
// packed.
const ArgDescriptor *OutgoingArg;
const TargetRegisterClass *ArgRC;
+ LLT Ty;
- std::tie(OutgoingArg, ArgRC) =
- CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
+ std::tie(OutgoingArg, ArgRC, Ty) =
+ CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
if (!OutgoingArg)
- std::tie(OutgoingArg, ArgRC) =
- CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
+ std::tie(OutgoingArg, ArgRC, Ty) =
+ CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
if (!OutgoingArg)
- std::tie(OutgoingArg, ArgRC) =
- CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
+ std::tie(OutgoingArg, ArgRC, Ty) =
+ CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
if (!OutgoingArg)
return;
- const ArgDescriptor *IncomingArgX
- = CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X).first;
- const ArgDescriptor *IncomingArgY
- = CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y).first;
- const ArgDescriptor *IncomingArgZ
- = CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z).first;
+ const ArgDescriptor *IncomingArgX = std::get<0>(
+ CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X));
+ const ArgDescriptor *IncomingArgY = std::get<0>(
+ CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y));
+ const ArgDescriptor *IncomingArgZ = std::get<0>(
+ CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z));
SDValue InputReg;
SDLoc SL;
// If incoming ids are not packed we need to pack them.
- if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo.WorkItemIDX)
+ if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX)
InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
- if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo.WorkItemIDY) {
+ if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY) {
SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
DAG.getShiftAmountConstant(10, MVT::i32, SL));
@@ -2549,7 +2661,7 @@ void SITargetLowering::passSpecialInputs(
DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y;
}
- if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo.WorkItemIDZ) {
+ if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ) {
SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
DAG.getShiftAmountConstant(20, MVT::i32, SL));
@@ -2569,8 +2681,9 @@ void SITargetLowering::passSpecialInputs(
if (OutgoingArg->isRegister()) {
RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
+ CCInfo.AllocateReg(OutgoingArg->getRegister());
} else {
- unsigned SpecialArgOffset = CCInfo.AllocateStack(4, 4);
+ unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
SpecialArgOffset);
MemOpChains.push_back(ArgStore);
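The packing performed above combines the caller's X, Y and Z workitem IDs into a single 32-bit value, with Y shifted left by 10 and Z by 20. A minimal standalone sketch of that bit layout (plain C++, illustrative only; masking to 10 bits is an assumption for clarity, the DAG code relies on the incoming values already fitting):

#include <cassert>
#include <cstdint>

// Each workitem ID occupies 10 bits of the packed register:
// bits [9:0] = X, bits [19:10] = Y, bits [29:20] = Z.
uint32_t packWorkItemIDs(uint32_t X, uint32_t Y, uint32_t Z) {
  return (X & 0x3ff) | ((Y & 0x3ff) << 10) | ((Z & 0x3ff) << 20);
}

int main() {
  assert(packWorkItemIDs(5, 3, 1) == (5u | (3u << 10) | (1u << 20)));
  return 0;
}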
@@ -2703,10 +2816,11 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
"unsupported call to variadic function ");
}
- if (!CLI.CS.getInstruction())
+ if (!CLI.CB)
report_fatal_error("unsupported libcall legalization");
- if (!CLI.CS.getCalledFunction()) {
+ if (!AMDGPUTargetMachine::EnableFixedFunctionABI &&
+ !CLI.CB->getCalledFunction()) {
return lowerUnhandledCall(CLI, InVals,
"unsupported indirect call to function ");
}
@@ -2726,7 +2840,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
if (IsTailCall) {
IsTailCall = isEligibleForTailCallOptimization(
Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
- if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall()) {
+ if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall()) {
report_fatal_error("failed to perform tail call elimination on a call "
"site marked musttail");
}
@@ -2743,12 +2857,19 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
}
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+ SmallVector<SDValue, 8> MemOpChains;
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
+ if (AMDGPUTargetMachine::EnableFixedFunctionABI) {
+ // With a fixed ABI, allocate fixed registers before user arguments.
+ passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
+ }
+
CCInfo.AnalyzeCallOperands(Outs, AssignFn);
// Get a count of how many bytes are to be pushed on the stack.
@@ -2767,7 +2888,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
// arguments to begin at SP+0. Completely unused for non-tail calls.
int32_t FPDiff = 0;
MachineFrameInfo &MFI = MF.getFrameInfo();
- SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
@@ -2784,7 +2904,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
Chain = DAG.getTokenFactor(DL, CopyFromChains);
}
- SmallVector<SDValue, 8> MemOpChains;
MVT PtrVT = MVT::i32;
// Walk the register/memloc assignments, inserting copies/loads.
@@ -2837,7 +2956,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
// FIXME: We can have better than the minimum byval required alignment.
Alignment =
Flags.isByVal()
- ? MaybeAlign(Flags.getByValAlign())
+ ? Flags.getNonZeroByValAlign()
: commonAlignment(Subtarget->getStackAlignment(), Offset);
Offset = Offset + FPDiff;
@@ -2864,11 +2983,12 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
if (Outs[i].Flags.isByVal()) {
SDValue SizeNode =
DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
- SDValue Cpy = DAG.getMemcpy(
- Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
- /*isVol = */ false, /*AlwaysInline = */ true,
- /*isTailCall = */ false, DstInfo,
- MachinePointerInfo(AMDGPUAS::PRIVATE_ADDRESS));
+ SDValue Cpy =
+ DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
+ Outs[i].Flags.getNonZeroByValAlign(),
+ /*isVol = */ false, /*AlwaysInline = */ true,
+ /*isTailCall = */ false, DstInfo,
+ MachinePointerInfo(AMDGPUAS::PRIVATE_ADDRESS));
MemOpChains.push_back(Cpy);
} else {
@@ -2879,8 +2999,10 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
}
}
- // Copy special input registers after user input arguments.
- passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
+ if (!AMDGPUTargetMachine::EnableFixedFunctionABI) {
+ // Copy special input registers after user input arguments.
+ passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
+ }
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
@@ -2927,9 +3049,12 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
Ops.push_back(Callee);
// Add a redundant copy of the callee global which will not be legalized, as
// we need direct access to the callee later.
- GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Callee);
- const GlobalValue *GV = GSD->getGlobal();
- Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
+ if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ const GlobalValue *GV = GSD->getGlobal();
+ Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
+ } else {
+ Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
+ }
if (IsTailCall) {
// Each tail call may have to adjust the stack by a different amount, so
@@ -2985,6 +3110,71 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
IsThisReturn ? OutVals[0] : SDValue());
}
+// This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC,
+// except for applying the wave size scale to the increment amount.
+SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(
+ SDValue Op, SelectionDAG &DAG) const {
+ const MachineFunction &MF = DAG.getMachineFunction();
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+
+ SDLoc dl(Op);
+ EVT VT = Op.getValueType();
+ SDValue Tmp1 = Op;
+ SDValue Tmp2 = Op.getValue(1);
+ SDValue Tmp3 = Op.getOperand(2);
+ SDValue Chain = Tmp1.getOperand(0);
+
+ Register SPReg = Info->getStackPtrOffsetReg();
+
+ // Chain the dynamic stack allocation so that it doesn't modify the stack
+ // pointer when other instructions are using the stack.
+ Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
+
+ SDValue Size = Tmp2.getOperand(1);
+ SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
+ Chain = SP.getValue(1);
+ MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const TargetFrameLowering *TFL = ST.getFrameLowering();
+ unsigned Opc =
+ TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp ?
+ ISD::ADD : ISD::SUB;
+
+ SDValue ScaledSize = DAG.getNode(
+ ISD::SHL, dl, VT, Size,
+ DAG.getConstant(ST.getWavefrontSizeLog2(), dl, MVT::i32));
+
+ Align StackAlign = TFL->getStackAlign();
+ Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value
+ if (Alignment && *Alignment > StackAlign) {
+ Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
+ DAG.getConstant(-(uint64_t)Alignment->value()
+ << ST.getWavefrontSizeLog2(),
+ dl, VT));
+ }
+
+ Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
+ Tmp2 = DAG.getCALLSEQ_END(
+ Chain, DAG.getIntPtrConstant(0, dl, true),
+ DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
+
+ return DAG.getMergeValues({Tmp1, Tmp2}, dl);
+}
+
+SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+ SelectionDAG &DAG) const {
+ // We only handle constant sizes here to allow non-entry block, static sized
+ // allocas. A truly dynamic value is more difficult to support because we
+ // don't know if the size value is uniform or not. If the size isn't uniform,
+ // we would need to do a wave reduction to get the maximum size to know how
+ // much to increment the uniform stack pointer.
+ SDValue Size = Op.getOperand(1);
+ if (isa<ConstantSDNode>(Size))
+ return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion.
+
+ return AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(Op, DAG);
+}
+
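The expansion above differs from the generic one only in scaling the increment by the wavefront size, since the private stack pointer is maintained per wave rather than per lane. A minimal standalone sketch of that scaling (plain C++, not the LLVM APIs; wave64 is assumed in the example):

#include <cassert>
#include <cstdint>

// A per-lane allocation of N bytes advances the wave-level stack pointer by
// N << log2(wavefront size), because each lane's slot is interleaved across
// the wave.
uint32_t scaleAllocForWave(uint32_t PerLaneBytes, unsigned WavefrontSizeLog2) {
  return PerLaneBytes << WavefrontSizeLog2;
}

int main() {
  // wave64: a 16-byte per-lane alloca moves SP by 1024 bytes.
  assert(scaleAllocForWave(16, /*WavefrontSizeLog2=*/6) == 1024);
  return 0;
}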
Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const {
Register Reg = StringSwitch<Register>(RegName)
@@ -3310,9 +3500,15 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
InitResultReg, DstReg, PhiReg, TmpExec,
Offset, UseGPRIdxMode, IsIndirectSrc);
-
- MachineBasicBlock::iterator First = RemainderBB->begin();
- BuildMI(*RemainderBB, First, DL, TII->get(MovExecOpc), Exec)
+ MachineBasicBlock* LandingPad = MF->CreateMachineBasicBlock();
+ MachineFunction::iterator MBBI(LoopBB);
+ ++MBBI;
+ MF->insert(MBBI, LandingPad);
+ LoopBB->removeSuccessor(RemainderBB);
+ LandingPad->addSuccessor(RemainderBB);
+ LoopBB->addSuccessor(LandingPad);
+ MachineBasicBlock::iterator First = LandingPad->begin();
+ BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
.addReg(SaveExec);
return InsPt;
@@ -3331,7 +3527,7 @@ computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
if (Offset >= NumElts || Offset < 0)
return std::make_pair(AMDGPU::sub0, Offset);
- return std::make_pair(AMDGPU::sub0 + Offset, 0);
+ return std::make_pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
}
// Return true if the index is an SGPR and was set.
@@ -3465,24 +3661,6 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
return LoopBB;
}
-static unsigned getMOVRELDPseudo(const SIRegisterInfo &TRI,
- const TargetRegisterClass *VecRC) {
- switch (TRI.getRegSizeInBits(*VecRC)) {
- case 32: // 4 bytes
- return AMDGPU::V_MOVRELD_B32_V1;
- case 64: // 8 bytes
- return AMDGPU::V_MOVRELD_B32_V2;
- case 128: // 16 bytes
- return AMDGPU::V_MOVRELD_B32_V4;
- case 256: // 32 bytes
- return AMDGPU::V_MOVRELD_B32_V8;
- case 512: // 64 bytes
- return AMDGPU::V_MOVRELD_B32_V16;
- default:
- llvm_unreachable("unsupported size for MOVRELD pseudos");
- }
-}
-
static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
MachineBasicBlock &MBB,
const GCNSubtarget &ST) {
@@ -3522,28 +3700,18 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
return &MBB;
}
+ const MCInstrDesc &MovRelDesc
+ = TII->getIndirectRegWritePseudo(TRI.getRegSizeInBits(*VecRC), 32, false);
+
if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, false)) {
MachineBasicBlock::iterator I(&MI);
const DebugLoc &DL = MI.getDebugLoc();
-
- if (UseGPRIdxMode) {
- BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
- .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst
- .add(*Val)
- .addReg(Dst, RegState::ImplicitDefine)
- .addReg(SrcVec->getReg(), RegState::Implicit)
- .addReg(AMDGPU::M0, RegState::Implicit);
-
+ BuildMI(MBB, I, DL, MovRelDesc, Dst)
+ .addReg(SrcVec->getReg())
+ .add(*Val)
+ .addImm(SubReg);
+ if (UseGPRIdxMode)
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
- } else {
- const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
-
- BuildMI(MBB, I, DL, MovRelDesc)
- .addReg(Dst, RegState::Define)
- .addReg(SrcVec->getReg())
- .add(*Val)
- .addImm(SubReg - AMDGPU::sub0);
- }
MI.eraseFromParent();
return &MBB;
@@ -3560,26 +3728,14 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
Offset, UseGPRIdxMode, false);
MachineBasicBlock *LoopBB = InsPt->getParent();
- if (UseGPRIdxMode) {
- BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
- .addReg(PhiReg, RegState::Undef, SubReg) // vdst
- .add(*Val) // src0
- .addReg(Dst, RegState::ImplicitDefine)
- .addReg(PhiReg, RegState::Implicit)
- .addReg(AMDGPU::M0, RegState::Implicit);
+ BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
+ .addReg(PhiReg)
+ .add(*Val)
+ .addImm(AMDGPU::sub0);
+ if (UseGPRIdxMode)
BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
- } else {
- const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
-
- BuildMI(*LoopBB, InsPt, DL, MovRelDesc)
- .addReg(Dst, RegState::Define)
- .addReg(PhiReg)
- .add(*Val)
- .addImm(SubReg - AMDGPU::sub0);
- }
MI.eraseFromParent();
-
return LoopBB;
}
@@ -3590,17 +3746,27 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
MachineFunction *MF = BB->getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
- if (TII->isMIMG(MI)) {
- if (MI.memoperands_empty() && MI.mayLoadOrStore()) {
- report_fatal_error("missing mem operand from MIMG instruction");
- }
- // Add a memoperand for mimg instructions so that they aren't assumed to
- // be ordered memory instuctions.
+ switch (MI.getOpcode()) {
+ case AMDGPU::S_UADDO_PSEUDO:
+ case AMDGPU::S_USUBO_PSEUDO: {
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineOperand &Dest0 = MI.getOperand(0);
+ MachineOperand &Dest1 = MI.getOperand(1);
+ MachineOperand &Src0 = MI.getOperand(2);
+ MachineOperand &Src1 = MI.getOperand(3);
+
+ unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
+ ? AMDGPU::S_ADD_I32
+ : AMDGPU::S_SUB_I32;
+ BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg()).add(Src0).add(Src1);
+
+ BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
+ .addImm(1)
+ .addImm(0);
+ MI.eraseFromParent();
return BB;
}
-
- switch (MI.getOpcode()) {
case AMDGPU::S_ADD_U64_PSEUDO:
case AMDGPU::S_SUB_U64_PSEUDO: {
MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
@@ -3616,35 +3782,150 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
- MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
- Src0, BoolRC, AMDGPU::sub0,
- &AMDGPU::SReg_32RegClass);
- MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
- Src0, BoolRC, AMDGPU::sub1,
- &AMDGPU::SReg_32RegClass);
+ MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
+ MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
- MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
- Src1, BoolRC, AMDGPU::sub0,
- &AMDGPU::SReg_32RegClass);
- MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
- Src1, BoolRC, AMDGPU::sub1,
- &AMDGPU::SReg_32RegClass);
+ MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
+ MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
- BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
- .add(Src0Sub0)
- .add(Src1Sub0);
- BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
- .add(Src0Sub1)
- .add(Src1Sub1);
+ BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
+ BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
- .addReg(DestSub0)
- .addImm(AMDGPU::sub0)
- .addReg(DestSub1)
- .addImm(AMDGPU::sub1);
+ .addReg(DestSub0)
+ .addImm(AMDGPU::sub0)
+ .addReg(DestSub1)
+ .addImm(AMDGPU::sub1);
+ MI.eraseFromParent();
+ return BB;
+ }
+ case AMDGPU::V_ADD_U64_PSEUDO:
+ case AMDGPU::V_SUB_U64_PSEUDO: {
+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
+
+ const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
+
+ Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ Register CarryReg = MRI.createVirtualRegister(CarryRC);
+ Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
+
+ MachineOperand &Dest = MI.getOperand(0);
+ MachineOperand &Src0 = MI.getOperand(1);
+ MachineOperand &Src1 = MI.getOperand(2);
+
+ const TargetRegisterClass *Src0RC = Src0.isReg()
+ ? MRI.getRegClass(Src0.getReg())
+ : &AMDGPU::VReg_64RegClass;
+ const TargetRegisterClass *Src1RC = Src1.isReg()
+ ? MRI.getRegClass(Src1.getReg())
+ : &AMDGPU::VReg_64RegClass;
+
+ const TargetRegisterClass *Src0SubRC =
+ TRI->getSubRegClass(Src0RC, AMDGPU::sub0);
+ const TargetRegisterClass *Src1SubRC =
+ TRI->getSubRegClass(Src1RC, AMDGPU::sub1);
+
+ MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
+ MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
+
+ MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
+ MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
+
+ unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
+ MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
+ .addReg(CarryReg, RegState::Define)
+ .add(SrcReg0Sub0)
+ .add(SrcReg1Sub0)
+ .addImm(0); // clamp bit
+
+ unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
+ MachineInstr *HiHalf =
+ BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
+ .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
+ .add(SrcReg0Sub1)
+ .add(SrcReg1Sub1)
+ .addReg(CarryReg, RegState::Kill)
+ .addImm(0); // clamp bit
+
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
+ .addReg(DestSub0)
+ .addImm(AMDGPU::sub0)
+ .addReg(DestSub1)
+ .addImm(AMDGPU::sub1);
+ TII->legalizeOperands(*LoHalf);
+ TII->legalizeOperands(*HiHalf);
+ MI.eraseFromParent();
+ return BB;
+ }
+ case AMDGPU::S_ADD_CO_PSEUDO:
+ case AMDGPU::S_SUB_CO_PSEUDO: {
+    // This pseudo can only be selected from a uniform add/subcarry node, so
+    // all of its VGPR operands are assumed to be splat vectors.
+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ MachineBasicBlock::iterator MII = MI;
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineOperand &Dest = MI.getOperand(0);
+ MachineOperand &CarryDest = MI.getOperand(1);
+ MachineOperand &Src0 = MI.getOperand(2);
+ MachineOperand &Src1 = MI.getOperand(3);
+ MachineOperand &Src2 = MI.getOperand(4);
+ unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
+ ? AMDGPU::S_ADDC_U32
+ : AMDGPU::S_SUBB_U32;
+ if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
+ Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
+ .addReg(Src0.getReg());
+ Src0.setReg(RegOp0);
+ }
+ if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
+ Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
+ .addReg(Src1.getReg());
+ Src1.setReg(RegOp1);
+ }
+ Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ if (TRI->isVectorRegister(MRI, Src2.getReg())) {
+ BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
+ .addReg(Src2.getReg());
+ Src2.setReg(RegOp2);
+ }
+
+ if (TRI->getRegSizeInBits(*MRI.getRegClass(Src2.getReg())) == 64) {
+ BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
+ .addReg(Src2.getReg())
+ .addImm(0);
+ } else {
+ BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMPK_LG_U32))
+ .addReg(Src2.getReg())
+ .addImm(0);
+ }
+
+ BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1);
+
+ BuildMI(*BB, MII, DL, TII->get(AMDGPU::COPY), CarryDest.getReg())
+ .addReg(AMDGPU::SCC);
MI.eraseFromParent();
return BB;
}
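The V_ADD_U64_PSEUDO/V_SUB_U64_PSEUDO expansion above splits a 64-bit operation into a low 32-bit op that defines a carry and a high 32-bit op that consumes it. A rough standalone model of the add case (plain C++, illustrative only):

#include <cassert>
#include <cstdint>

uint64_t add64ViaHalves(uint64_t A, uint64_t B) {
  uint32_t ALo = uint32_t(A), AHi = uint32_t(A >> 32);
  uint32_t BLo = uint32_t(B), BHi = uint32_t(B >> 32);
  uint32_t Lo = ALo + BLo;           // V_ADD_I32_e64: low half, defines carry
  uint32_t Carry = Lo < ALo ? 1 : 0; // carry-out of the low addition
  uint32_t Hi = AHi + BHi + Carry;   // V_ADDC_U32_e64: high half, uses carry
  return (uint64_t(Hi) << 32) | Lo;
}

int main() {
  assert(add64ViaHalves(0xffffffffULL, 1) == 0x100000000ULL);
  assert(add64ViaHalves(0x123456789aULL, 0xabcdefULL) ==
         0x123456789aULL + 0xabcdefULL);
  return 0;
}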
@@ -3741,12 +4022,14 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
case AMDGPU::SI_INDIRECT_SRC_V4:
case AMDGPU::SI_INDIRECT_SRC_V8:
case AMDGPU::SI_INDIRECT_SRC_V16:
+ case AMDGPU::SI_INDIRECT_SRC_V32:
return emitIndirectSrc(MI, *BB, *getSubtarget());
case AMDGPU::SI_INDIRECT_DST_V1:
case AMDGPU::SI_INDIRECT_DST_V2:
case AMDGPU::SI_INDIRECT_DST_V4:
case AMDGPU::SI_INDIRECT_DST_V8:
case AMDGPU::SI_INDIRECT_DST_V16:
+ case AMDGPU::SI_INDIRECT_DST_V32:
return emitIndirectDst(MI, *BB, *getSubtarget());
case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
case AMDGPU::SI_KILL_I1_PSEUDO:
@@ -3870,6 +4153,75 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
}
return emitGWSMemViolTestLoop(MI, BB);
+ case AMDGPU::S_SETREG_B32: {
+ if (!getSubtarget()->hasDenormModeInst())
+ return BB;
+
+ // Try to optimize cases that only set the denormal mode or rounding mode.
+ //
+ // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
+ // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
+ // instead.
+ //
+    // FIXME: This could be predicated on the immediate, but tablegen doesn't
+    // allow a no-side-effect instruction in the output of a side-effecting
+    // pattern.
+
+    // TODO: Should also emit a no-side-effect pseudo if only FP bits are
+    // touched, even if not all of them are set or they are set to a variable.
+ unsigned ID, Offset, Width;
+ AMDGPU::Hwreg::decodeHwreg(MI.getOperand(1).getImm(), ID, Offset, Width);
+ if (ID != AMDGPU::Hwreg::ID_MODE)
+ return BB;
+
+ const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
+ const unsigned SetMask = WidthMask << Offset;
+ unsigned SetDenormOp = 0;
+ unsigned SetRoundOp = 0;
+
+ // The dedicated instructions can only set the whole denorm or round mode at
+ // once, not a subset of bits in either.
+ if (Width == 8 && (SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
+ AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask) {
+ // If this fully sets both the round and denorm mode, emit the two
+ // dedicated instructions for these.
+ assert(Offset == 0);
+ SetRoundOp = AMDGPU::S_ROUND_MODE;
+ SetDenormOp = AMDGPU::S_DENORM_MODE;
+ } else if (Width == 4) {
+ if ((SetMask & AMDGPU::Hwreg::FP_ROUND_MASK) == SetMask) {
+ SetRoundOp = AMDGPU::S_ROUND_MODE;
+ assert(Offset == 0);
+ } else if ((SetMask & AMDGPU::Hwreg::FP_DENORM_MASK) == SetMask) {
+ SetDenormOp = AMDGPU::S_DENORM_MODE;
+ assert(Offset == 4);
+ }
+ }
+
+ if (SetRoundOp || SetDenormOp) {
+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+ MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
+ if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
+ unsigned ImmVal = Def->getOperand(1).getImm();
+ if (SetRoundOp) {
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
+ .addImm(ImmVal & 0xf);
+
+ // If we also have the denorm mode, get just the denorm mode bits.
+ ImmVal >>= 4;
+ }
+
+ if (SetDenormOp) {
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
+ .addImm(ImmVal & 0xf);
+ }
+
+ MI.eraseFromParent();
+ }
+ }
+
+ return BB;
+ }
default:
return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
}
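The S_SETREG_B32 case above only fires when the written field is exactly the FP round mode (MODE bits [3:0]), the FP denorm mode (bits [7:4]), or both. A standalone sketch of the field test, assuming the usual hwreg(id, offset, width-1) immediate packing (the encoding constants here are an assumption for illustration):

#include <cassert>

// Decode an s_setreg-style immediate: id in bits [5:0], offset in [10:6],
// width-1 in [15:11] (assumed layout, mirroring AMDGPU::Hwreg::decodeHwreg).
void decodeHwregImm(unsigned Imm, unsigned &Id, unsigned &Offset,
                    unsigned &Width) {
  Id = Imm & 0x3f;
  Offset = (Imm >> 6) & 0x1f;
  Width = ((Imm >> 11) & 0x1f) + 1;
}

// True if the written bit range lies entirely within FieldMask.
bool writesOnlyField(unsigned Offset, unsigned Width, unsigned FieldMask) {
  unsigned SetMask = ((1u << Width) - 1u) << Offset;
  return (SetMask & FieldMask) == SetMask;
}

int main() {
  const unsigned FPRoundMask = 0xfu;       // MODE bits [3:0]
  const unsigned FPDenormMask = 0xfu << 4; // MODE bits [7:4]
  assert(writesOnlyField(0, 4, FPRoundMask));                // s_round_mode
  assert(writesOnlyField(4, 4, FPDenormMask));               // s_denorm_mode
  assert(writesOnlyField(0, 8, FPRoundMask | FPDenormMask)); // both at once
  return 0;
}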
@@ -3925,10 +4277,13 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
switch (VT.getSimpleVT().SimpleTy) {
case MVT::f32: {
- // This is as fast on some subtargets. However, we always have full rate f32
- // mad available which returns the same result as the separate operations
- // which we should prefer over fma. We can't use this if we want to support
- // denormals, so only report this in these cases.
+    // If mad is not available, this depends only on whether f32 fma is full rate.
+ if (!Subtarget->hasMadMacF32Insts())
+ return Subtarget->hasFastFMAF32();
+
+    // Otherwise f32 mad is always full rate and returns the same result as
+    // the separate operations, so it should be preferred over fma. However,
+    // mad does not support denormals.
if (hasFP32Denormals(MF))
return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
@@ -3946,13 +4301,14 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
return false;
}
-bool SITargetLowering::isFMADLegalForFAddFSub(const SelectionDAG &DAG,
- const SDNode *N) const {
+bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG,
+ const SDNode *N) const {
// TODO: Check future ftz flag
// v_mad_f32/v_mac_f32 do not support denormals.
EVT VT = N->getValueType(0);
if (VT == MVT::f32)
- return !hasFP32Denormals(DAG.getMachineFunction());
+ return Subtarget->hasMadMacF32Insts() &&
+ !hasFP32Denormals(DAG.getMachineFunction());
if (VT == MVT::f16) {
return Subtarget->hasMadF16() &&
!hasFP64FP16Denormals(DAG.getMachineFunction());
@@ -3971,7 +4327,7 @@ SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
SelectionDAG &DAG) const {
unsigned Opc = Op.getOpcode();
EVT VT = Op.getValueType();
- assert(VT == MVT::v4f16);
+ assert(VT == MVT::v4f16 || VT == MVT::v4i16);
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
@@ -4080,6 +4436,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FABS:
case ISD::FNEG:
case ISD::FCANONICALIZE:
+ case ISD::BSWAP:
return splitUnaryVectorOp(Op, DAG);
case ISD::FMINNUM:
case ISD::FMAXNUM:
@@ -4101,6 +4458,11 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FMINNUM_IEEE:
case ISD::FMAXNUM_IEEE:
return splitBinaryVectorOp(Op, DAG);
+ case ISD::SMULO:
+ case ISD::UMULO:
+ return lowerXMULO(Op, DAG);
+ case ISD::DYNAMIC_STACKALLOC:
+ return LowerDYNAMIC_STACKALLOC(Op, DAG);
}
return SDValue();
}
@@ -4204,9 +4566,8 @@ static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI,
SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
const auto *CD = cast<ConstantSDNode>(N->getOperand(3));
- int CondCode = CD->getSExtValue();
- if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE ||
- CondCode > ICmpInst::Predicate::LAST_ICMP_PREDICATE)
+ unsigned CondCode = CD->getZExtValue();
+ if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
return DAG.getUNDEF(VT);
ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
@@ -4241,11 +4602,9 @@ static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI,
EVT VT = N->getValueType(0);
const auto *CD = cast<ConstantSDNode>(N->getOperand(3));
- int CondCode = CD->getSExtValue();
- if (CondCode < FCmpInst::Predicate::FIRST_FCMP_PREDICATE ||
- CondCode > FCmpInst::Predicate::LAST_FCMP_PREDICATE) {
+ unsigned CondCode = CD->getZExtValue();
+ if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
return DAG.getUNDEF(VT);
- }
SDValue Src0 = N->getOperand(1);
SDValue Src1 = N->getOperand(2);
@@ -4268,6 +4627,43 @@ static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI,
return DAG.getZExtOrTrunc(SetCC, SL, VT);
}
+static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
+ SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+ SDValue Src = N->getOperand(1);
+ SDLoc SL(N);
+
+ if (Src.getOpcode() == ISD::SETCC) {
+ // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
+ return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
+ Src.getOperand(1), Src.getOperand(2));
+ }
+ if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
+ // (ballot 0) -> 0
+ if (Arg->isNullValue())
+ return DAG.getConstant(0, SL, VT);
+
+ // (ballot 1) -> EXEC/EXEC_LO
+ if (Arg->isOne()) {
+ Register Exec;
+ if (VT.getScalarSizeInBits() == 32)
+ Exec = AMDGPU::EXEC_LO;
+ else if (VT.getScalarSizeInBits() == 64)
+ Exec = AMDGPU::EXEC;
+ else
+ return SDValue();
+
+ return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
+ }
+ }
+
+ // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
+ // ISD::SETNE)
+ return DAG.getNode(
+ AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
+ DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
+}
+
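Conceptually, amdgcn.ballot produces one result bit per active lane, set when that lane's condition is true, which is why a constant-true source folds to the exec mask and a constant-false source folds to zero. A rough lane-level model (plain C++, not the DAG lowering itself):

#include <cstdint>
#include <vector>

uint64_t ballot(const std::vector<bool> &LaneCond, uint64_t ExecMask) {
  uint64_t Result = 0;
  for (unsigned Lane = 0; Lane < LaneCond.size() && Lane < 64; ++Lane)
    if (((ExecMask >> Lane) & 1) && LaneCond[Lane])
      Result |= uint64_t(1) << Lane;
  return Result;
}

int main() {
  std::vector<bool> AllTrue(64, true), AllFalse(64, false);
  uint64_t Exec = 0xffffULL; // 16 active lanes
  return (ballot(AllTrue, Exec) == Exec && ballot(AllFalse, Exec) == 0) ? 0 : 1;
}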
void SITargetLowering::ReplaceNodeResults(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const {
@@ -4440,9 +4836,7 @@ bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
// FIXME: Either avoid relying on address space here or change the default
// address space for functions to avoid the explicit check.
return (GV->getValueType()->isFunctionTy() ||
- GV->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
- GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
- GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
+ !isNonGlobalAddrSpace(GV->getAddressSpace())) &&
!shouldEmitFixup(GV) &&
!getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
}
@@ -4451,6 +4845,14 @@ bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
}
+bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const {
+ if (!GV->hasExternalLinkage())
+ return true;
+
+ const auto OS = getTargetMachine().getTargetTriple().getOS();
+ return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
+}
+
/// This transforms the control flow intrinsics to get the branch destination
/// as the last parameter, and also switches the branch target with BR if the
/// need arises.
SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
@@ -4470,16 +4872,10 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
} else {
// Get the target from BR if we don't negate the condition
BR = findUser(BRCOND, ISD::BR);
+ assert(BR && "brcond missing unconditional branch user");
Target = BR->getOperand(1);
}
- // FIXME: This changes the types of the intrinsics instead of introducing new
- // nodes with the correct types.
- // e.g. llvm.amdgcn.loop
-
- // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3
- // => t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088>
-
unsigned CFNode = isCFIntrinsic(Intr);
if (CFNode == 0) {
// This is a uniform branch so we don't need to legalize.
@@ -4524,7 +4920,6 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
};
SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
- BR = NewBR.getNode();
}
SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
@@ -4577,13 +4972,14 @@ SDValue SITargetLowering::LowerRETURNADDR(SDValue Op,
return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
}
-SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG,
+SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG,
SDValue Op,
const SDLoc &DL,
EVT VT) const {
return Op.getValueType().bitsLE(VT) ?
DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
- DAG.getNode(ISD::FTRUNC, DL, VT, Op);
+ DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
+ DAG.getTargetConstant(0, DL, MVT::i32));
}
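The rename above also changes the semantics: ISD::FTRUNC rounds a value to an integral value, whereas ISD::FP_ROUND narrows the floating-point type, which is what converting to a smaller FP type requires. A small standalone illustration of the difference (plain C++):

#include <cassert>
#include <cmath>

int main() {
  double X = 3.75;
  float Narrowed = static_cast<float>(X); // FP_ROUND-like: 3.75f, type narrowed
  double Truncated = std::trunc(X);       // FTRUNC-like: 3.0, fraction dropped
  assert(Narrowed == 3.75f);
  assert(Truncated == 3.0);
  return 0;
}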
SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
@@ -4609,7 +5005,7 @@ SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
bool IsIEEEMode = Info->getMode().IEEE;
- // FIXME: Assert during eslection that this is only selected for
+ // FIXME: Assert during selection that this is only selected for
// ieee_mode. Currently a combine can produce the ieee version for non-ieee
// mode functions, but this happens to be OK since it's only done in cases
// where there is known no sNaN.
@@ -4621,6 +5017,42 @@ SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
return Op;
}
+SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ SDLoc SL(Op);
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ bool isSigned = Op.getOpcode() == ISD::SMULO;
+
+ if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
+ const APInt &C = RHSC->getAPIntValue();
+ // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
+ if (C.isPowerOf2()) {
+ // smulo(x, signed_min) is same as umulo(x, signed_min).
+ bool UseArithShift = isSigned && !C.isMinSignedValue();
+ SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
+ SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
+ SDValue Overflow = DAG.getSetCC(SL, MVT::i1,
+ DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL,
+ SL, VT, Result, ShiftAmt),
+ LHS, ISD::SETNE);
+ return DAG.getMergeValues({ Result, Overflow }, SL);
+ }
+ }
+
+ SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
+ SDValue Top = DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU,
+ SL, VT, LHS, RHS);
+
+ SDValue Sign = isSigned
+ ? DAG.getNode(ISD::SRA, SL, VT, Result,
+ DAG.getConstant(VT.getScalarSizeInBits() - 1, SL, MVT::i32))
+ : DAG.getConstant(0, SL, VT);
+ SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
+
+ return DAG.getMergeValues({ Result, Overflow }, SL);
+}
+
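For a power-of-two right-hand side the lowering above avoids a full high-half multiply: the product is LHS << S, and overflow occurred exactly when shifting the product back (arithmetically for signed, logically for unsigned) does not recover LHS. A standalone sketch of the unsigned case (plain C++, illustrative only):

#include <cassert>
#include <cstdint>

struct MulO {
  uint32_t Result;
  bool Overflow;
};

// umulo(LHS, 1 << S) -> { LHS << S, (LHS << S) >> S != LHS }
MulO umuloPow2(uint32_t LHS, unsigned S) {
  uint32_t Result = LHS << S;
  return {Result, (Result >> S) != LHS};
}

int main() {
  assert(!umuloPow2(3, 2).Overflow);          // 3 * 4 fits in 32 bits
  assert(umuloPow2(0x40000000u, 2).Overflow); // 2^30 * 4 does not
  return 0;
}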
SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
SDValue Chain = Op.getOperand(0);
@@ -4694,7 +5126,7 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
MachineFunction &MF = DAG.getMachineFunction();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- unsigned UserSGPR = Info->getQueuePtrUserSGPR();
+ Register UserSGPR = Info->getQueuePtrUserSGPR();
assert(UserSGPR != AMDGPU::NoRegister);
SDValue QueuePtr = CreateLiveInRegister(
@@ -4765,6 +5197,10 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
}
}
+ if (ASC->getDestAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
+ Src.getValueType() == MVT::i64)
+ return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
+
// global <-> flat are no-ops and never emitted.
const MachineFunction &MF = DAG.getMachineFunction();
@@ -5036,8 +5472,9 @@ SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
static SDValue
buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
- const SDLoc &DL, unsigned Offset, EVT PtrVT,
+ const SDLoc &DL, int64_t Offset, EVT PtrVT,
unsigned GAFlags = SIInstrInfo::MO_NONE) {
+ assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
// In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
// lowered to the following code sequence:
//
@@ -5086,9 +5523,7 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = GSD->getGlobal();
if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
- (!GV->hasExternalLinkage() ||
- getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
- getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL)) ||
+ shouldUseLDSConstAddress(GV)) ||
GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
@@ -5114,11 +5549,11 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
const DataLayout &DataLayout = DAG.getDataLayout();
- unsigned Align = DataLayout.getABITypeAlignment(PtrTy);
+ Align Alignment = DataLayout.getABITypeAlign(PtrTy);
MachinePointerInfo PtrInfo
= MachinePointerInfo::getGOT(DAG.getMachineFunction());
- return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align,
+ return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant);
}
@@ -5144,8 +5579,8 @@ SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
MVT VT,
unsigned Offset) const {
SDLoc SL(Op);
- SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL,
- DAG.getEntryNode(), Offset, 4, false);
+ SDValue Param = lowerKernargMemParameter(
+ DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
// The local size values will have the hi 16-bits as zero.
return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
DAG.getValueType(VT));
@@ -5181,6 +5616,9 @@ static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
} else if (Elts.size() == 2) {
Type = MVT::v2f32;
NumElts = 2;
+ } else if (Elts.size() == 3) {
+ Type = MVT::v3f32;
+ NumElts = 3;
} else if (Elts.size() <= 4) {
Type = MVT::v4f32;
NumElts = 4;
@@ -5230,6 +5668,24 @@ static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG,
return Value == 0;
}
+static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
+ SDValue Src, int ExtraElts) {
+ EVT SrcVT = Src.getValueType();
+
+ SmallVector<SDValue, 8> Elts;
+
+ if (SrcVT.isVector())
+ DAG.ExtractVectorElements(Src, Elts);
+ else
+ Elts.push_back(Src);
+
+ SDValue Undef = DAG.getUNDEF(SrcVT.getScalarType());
+ while (ExtraElts--)
+ Elts.push_back(Undef);
+
+ return DAG.getBuildVector(CastVT, DL, Elts);
+}
+
// Re-construct the required return value for an image load intrinsic.
// This is more complicated due to the optional use of TexFailCtrl, which means
// the required return type is an aggregate.
@@ -5241,76 +5697,56 @@ static SDValue constructRetValue(SelectionDAG &DAG,
const SDLoc &DL, LLVMContext &Context) {
// Determine the required return type. This is the same regardless of IsTexFail flag
EVT ReqRetVT = ResultTypes[0];
- EVT ReqRetEltVT = ReqRetVT.isVector() ? ReqRetVT.getVectorElementType() : ReqRetVT;
int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
- EVT AdjEltVT = Unpacked && IsD16 ? MVT::i32 : ReqRetEltVT;
- EVT AdjVT = Unpacked ? ReqRetNumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, ReqRetNumElts)
- : AdjEltVT
- : ReqRetVT;
-
- // Extract data part of the result
- // Bitcast the result to the same type as the required return type
- int NumElts;
- if (IsD16 && !Unpacked)
- NumElts = NumVDataDwords << 1;
- else
- NumElts = NumVDataDwords;
+ int NumDataDwords = (!IsD16 || (IsD16 && Unpacked)) ?
+ ReqRetNumElts : (ReqRetNumElts + 1) / 2;
- EVT CastVT = NumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, NumElts)
- : AdjEltVT;
+ int MaskPopDwords = (!IsD16 || (IsD16 && Unpacked)) ?
+ DMaskPop : (DMaskPop + 1) / 2;
- // Special case for v6f16. Rather than add support for this, use v3i32 to
- // extract the data elements
- bool V6F16Special = false;
- if (NumElts == 6) {
- CastVT = EVT::getVectorVT(Context, MVT::i32, NumElts / 2);
- DMaskPop >>= 1;
- ReqRetNumElts >>= 1;
- V6F16Special = true;
- AdjVT = MVT::v2i32;
- }
+ MVT DataDwordVT = NumDataDwords == 1 ?
+ MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
- SDValue N = SDValue(Result, 0);
- SDValue CastRes = DAG.getNode(ISD::BITCAST, DL, CastVT, N);
+ MVT MaskPopVT = MaskPopDwords == 1 ?
+ MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
- // Iterate over the result
- SmallVector<SDValue, 4> BVElts;
+ SDValue Data(Result, 0);
+ SDValue TexFail;
- if (CastVT.isVector()) {
- DAG.ExtractVectorElements(CastRes, BVElts, 0, DMaskPop);
- } else {
- BVElts.push_back(CastRes);
- }
- int ExtraElts = ReqRetNumElts - DMaskPop;
- while(ExtraElts--)
- BVElts.push_back(DAG.getUNDEF(AdjEltVT));
+ if (IsTexFail) {
+ SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
+ if (MaskPopVT.isVector()) {
+ Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
+ SDValue(Result, 0), ZeroIdx);
+ } else {
+ Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
+ SDValue(Result, 0), ZeroIdx);
+ }
- SDValue PreTFCRes;
- if (ReqRetNumElts > 1) {
- SDValue NewVec = DAG.getBuildVector(AdjVT, DL, BVElts);
- if (IsD16 && Unpacked)
- PreTFCRes = adjustLoadValueTypeImpl(NewVec, ReqRetVT, DL, DAG, Unpacked);
- else
- PreTFCRes = NewVec;
- } else {
- PreTFCRes = BVElts[0];
+ TexFail = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
+ SDValue(Result, 0),
+ DAG.getConstant(MaskPopDwords, DL, MVT::i32));
}
- if (V6F16Special)
- PreTFCRes = DAG.getNode(ISD::BITCAST, DL, MVT::v4f16, PreTFCRes);
+ if (DataDwordVT.isVector())
+ Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
+ NumDataDwords - MaskPopDwords);
- if (!IsTexFail) {
- if (Result->getNumValues() > 1)
- return DAG.getMergeValues({PreTFCRes, SDValue(Result, 1)}, DL);
- else
- return PreTFCRes;
- }
+ if (IsD16)
+ Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
+
+ if (!ReqRetVT.isVector())
+ Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
+
+ Data = DAG.getNode(ISD::BITCAST, DL, ReqRetVT, Data);
- // Extract the TexFail result and insert into aggregate return
- SmallVector<SDValue, 1> TFCElt;
- DAG.ExtractVectorElements(N, TFCElt, DMaskPop, 1);
- SDValue TFCRes = DAG.getNode(ISD::BITCAST, DL, ResultTypes[1], TFCElt[0]);
- return DAG.getMergeValues({PreTFCRes, TFCRes, SDValue(Result, 1)}, DL);
+ if (TexFail)
+ return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
+
+ if (Result->getNumValues() == 1)
+ return Data;
+
+ return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
}
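The reworked return-value construction above sizes everything in 32-bit dwords: with packed D16 results two 16-bit elements share a dword, so the count rounds up from half the element count, while unpacked D16 and 32-bit results use one dword per element. A minimal sketch of that accounting (plain C++):

#include <cassert>

int numDataDwords(int NumElts, bool IsD16, bool Unpacked) {
  return (!IsD16 || Unpacked) ? NumElts : (NumElts + 1) / 2;
}

int main() {
  assert(numDataDwords(4, /*IsD16=*/true, /*Unpacked=*/false) == 2);
  assert(numDataDwords(3, /*IsD16=*/true, /*Unpacked=*/false) == 2);
  assert(numDataDwords(4, /*IsD16=*/true, /*Unpacked=*/true) == 4);
  assert(numDataDwords(4, /*IsD16=*/false, /*Unpacked=*/false) == 4);
  return 0;
}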
static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
@@ -5331,6 +5767,35 @@ static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
return Value == 0;
}
+static void packImageA16AddressToDwords(SelectionDAG &DAG, SDValue Op,
+ MVT PackVectorVT,
+ SmallVectorImpl<SDValue> &PackedAddrs,
+ unsigned DimIdx, unsigned EndIdx,
+ unsigned NumGradients) {
+ SDLoc DL(Op);
+ for (unsigned I = DimIdx; I < EndIdx; I++) {
+ SDValue Addr = Op.getOperand(I);
+
+ // Gradients are packed with undef for each coordinate.
+ // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
+ // 1D: undef,dx/dh; undef,dx/dv
+ // 2D: dy/dh,dx/dh; dy/dv,dx/dv
+ // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
+ if (((I + 1) >= EndIdx) ||
+ ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
+ I == DimIdx + NumGradients - 1))) {
+ if (Addr.getValueType() != MVT::i16)
+ Addr = DAG.getBitcast(MVT::i16, Addr);
+ Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
+ } else {
+ Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
+ I++;
+ }
+ Addr = DAG.getBitcast(MVT::f32, Addr);
+ PackedAddrs.push_back(Addr);
+ }
+}
+
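packImageA16AddressToDwords above places the first of each coordinate pair in the low 16 bits of a dword and the second in the high 16 bits, with a trailing odd coordinate (or an odd gradient component) extended on its own. A simplified standalone model for plain coordinates (plain C++; the undef high half is represented as zero here, which is an assumption for illustration):

#include <cstdint>
#include <vector>

std::vector<uint32_t> packA16Coords(const std::vector<uint16_t> &Coords) {
  std::vector<uint32_t> Dwords;
  for (size_t I = 0; I < Coords.size(); I += 2) {
    uint32_t Lo = Coords[I];
    uint32_t Hi = (I + 1 < Coords.size()) ? Coords[I + 1] : 0; // undef in ISel
    Dwords.push_back(Lo | (Hi << 16));
  }
  return Dwords;
}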
SDValue SITargetLowering::lowerImage(SDValue Op,
const AMDGPU::ImageDimIntrinsicInfo *Intr,
SelectionDAG &DAG) const {
@@ -5350,6 +5815,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
SmallVector<EVT, 3> ResultTypes(Op->value_begin(), Op->value_end());
SmallVector<EVT, 3> OrigResultTypes(Op->value_begin(), Op->value_end());
bool IsD16 = false;
+ bool IsG16 = false;
bool IsA16 = false;
SDValue VData;
int NumVDataDwords;
@@ -5456,41 +5922,67 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
}
}
- // Check for 16 bit addresses and pack if true.
+ // Push back extra arguments.
+ for (unsigned I = 0; I < BaseOpcode->NumExtraArgs; I++)
+ VAddrs.push_back(Op.getOperand(AddrIdx + I));
+
+ // Check for 16 bit addresses or derivatives and pack if true.
unsigned DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
+ unsigned CoordIdx = DimIdx + NumGradients;
+ unsigned CoordsEnd = AddrIdx + NumMIVAddrs;
+
MVT VAddrVT = Op.getOperand(DimIdx).getSimpleValueType();
- const MVT VAddrScalarVT = VAddrVT.getScalarType();
- if (((VAddrScalarVT == MVT::f16) || (VAddrScalarVT == MVT::i16)) &&
- ST->hasFeature(AMDGPU::FeatureR128A16)) {
- IsA16 = true;
- const MVT VectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
- for (unsigned i = AddrIdx; i < (AddrIdx + NumMIVAddrs); ++i) {
- SDValue AddrLo, AddrHi;
- // Push back extra arguments.
- if (i < DimIdx) {
- AddrLo = Op.getOperand(i);
- } else {
- AddrLo = Op.getOperand(i);
- // Dz/dh, dz/dv and the last odd coord are packed with undef. Also,
- // in 1D, derivatives dx/dh and dx/dv are packed with undef.
- if (((i + 1) >= (AddrIdx + NumMIVAddrs)) ||
- ((NumGradients / 2) % 2 == 1 &&
- (i == DimIdx + (NumGradients / 2) - 1 ||
- i == DimIdx + NumGradients - 1))) {
- AddrHi = DAG.getUNDEF(MVT::f16);
- } else {
- AddrHi = Op.getOperand(i + 1);
- i++;
- }
- AddrLo = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VectorVT,
- {AddrLo, AddrHi});
- AddrLo = DAG.getBitcast(MVT::i32, AddrLo);
+ MVT VAddrScalarVT = VAddrVT.getScalarType();
+ MVT PackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
+ IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
+
+ VAddrVT = Op.getOperand(CoordIdx).getSimpleValueType();
+ VAddrScalarVT = VAddrVT.getScalarType();
+ IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
+ if (IsA16 || IsG16) {
+ if (IsA16) {
+ if (!ST->hasA16()) {
+ LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
+ "support 16 bit addresses\n");
+ return Op;
+ }
+ if (!IsG16) {
+ LLVM_DEBUG(
+ dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
+ "need 16 bit derivatives but got 32 bit derivatives\n");
+ return Op;
}
- VAddrs.push_back(AddrLo);
+ } else if (!ST->hasG16()) {
+ LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
+ "support 16 bit derivatives\n");
+ return Op;
+ }
+
+ if (BaseOpcode->Gradients && !IsA16) {
+ if (!ST->hasG16()) {
+ LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
+ "support 16 bit derivatives\n");
+ return Op;
+ }
+ // Activate g16
+ const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
+ AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
+ IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
+ }
+
+ // Don't compress addresses for G16
+ const int PackEndIdx = IsA16 ? CoordsEnd : CoordIdx;
+ packImageA16AddressToDwords(DAG, Op, PackVectorVT, VAddrs, DimIdx,
+ PackEndIdx, NumGradients);
+
+ if (!IsA16) {
+ // Add uncompressed address
+ for (unsigned I = CoordIdx; I < CoordsEnd; I++)
+ VAddrs.push_back(Op.getOperand(I));
}
} else {
- for (unsigned i = 0; i < NumMIVAddrs; ++i)
- VAddrs.push_back(Op.getOperand(AddrIdx + i));
+ for (unsigned I = DimIdx; I < CoordsEnd; I++)
+ VAddrs.push_back(Op.getOperand(I));
}
// If the register allocator cannot place the address registers contiguously
@@ -5557,8 +6049,8 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
}
EVT NewVT = NumVDataDwords > 1 ?
- EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumVDataDwords)
- : MVT::f32;
+ EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumVDataDwords)
+ : MVT::i32;
ResultTypes[0] = NewVT;
if (ResultTypes.size() == 3) {
@@ -5603,10 +6095,12 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
Ops.push_back(DLC);
Ops.push_back(GLC);
Ops.push_back(SLC);
- Ops.push_back(IsA16 && // a16 or r128
+ Ops.push_back(IsA16 && // r128, a16 for gfx9
ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
- Ops.push_back(TFE); // tfe
- Ops.push_back(LWE); // lwe
+ if (IsGFX10)
+ Ops.push_back(IsA16 ? True : False);
+ Ops.push_back(TFE);
+ Ops.push_back(LWE);
if (!IsGFX10)
Ops.push_back(DimInfo->DA ? True : False);
if (BaseOpcode->HasD16)
@@ -5655,26 +6149,25 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
}
SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
- SDValue Offset, SDValue GLC, SDValue DLC,
+ SDValue Offset, SDValue CachePolicy,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
const DataLayout &DataLayout = DAG.getDataLayout();
- unsigned Align =
- DataLayout.getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext()));
+ Align Alignment =
+ DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo(),
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant,
- VT.getStoreSize(), Align);
+ VT.getStoreSize(), Alignment);
if (!Offset->isDivergent()) {
SDValue Ops[] = {
Rsrc,
Offset, // Offset
- GLC,
- DLC,
+ CachePolicy
};
// Widen vec3 load to vec4.
@@ -5684,9 +6177,8 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
auto WidenedOp = DAG.getMemIntrinsicNode(
AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
- auto Subvector = DAG.getNode(
- ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
- DAG.getConstant(0, DL, getVectorIdxTy(DAG.getDataLayout())));
+ auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
+ DAG.getVectorIdxConstant(0, DL));
return Subvector;
}
@@ -5705,11 +6197,10 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
if (NumElts == 8 || NumElts == 16) {
NumLoads = NumElts / 4;
- LoadVT = MVT::v4i32;
+ LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
}
SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
- unsigned CachePolicy = cast<ConstantSDNode>(GLC)->getZExtValue();
SDValue Ops[] = {
DAG.getEntryNode(), // Chain
Rsrc, // rsrc
@@ -5717,13 +6208,14 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
{}, // voffset
{}, // soffset
{}, // offset
- DAG.getTargetConstant(CachePolicy, DL, MVT::i32), // cachepolicy
+ CachePolicy, // cachepolicy
DAG.getTargetConstant(0, DL, MVT::i1), // idxen
};
// Use the alignment to ensure that the required offsets will fit into the
// immediate offsets.
- setBufferOffsets(Offset, DAG, &Ops[3], NumLoads > 1 ? 16 * NumLoads : 4);
+ setBufferOffsets(Offset, DAG, &Ops[3],
+ NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue();
for (unsigned i = 0; i < NumLoads; ++i) {
@@ -5732,7 +6224,7 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
LoadVT, MMO, DAG));
}
- if (VT == MVT::v8i32 || VT == MVT::v16i32)
+ if (NumElts == 8 || NumElts == 16)
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
return Loads[0];
@@ -5777,6 +6269,11 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
}
case Intrinsic::amdgcn_kernarg_segment_ptr: {
+ if (!AMDGPU::isKernel(MF.getFunction().getCallingConv())) {
+ // This only makes sense to call in a kernel, so just lower to null.
+ return DAG.getConstant(0, DL, VT);
+ }
+
return getPreloadedValue(DAG, *MFI, VT,
AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
}
@@ -5790,8 +6287,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_rsq_legacy:
if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
return emitRemovedIntrinsicError(DAG, DL, VT);
-
- return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
+ return SDValue();
case Intrinsic::amdgcn_rcp_legacy:
if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
return emitRemovedIntrinsicError(DAG, DL, VT);
@@ -5815,37 +6311,43 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return emitNonHSAIntrinsicError(DAG, DL, VT);
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::NGROUPS_X, 4, false);
+ SI::KernelInputOffsets::NGROUPS_X, Align(4),
+ false);
case Intrinsic::r600_read_ngroups_y:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::NGROUPS_Y, 4, false);
+ SI::KernelInputOffsets::NGROUPS_Y, Align(4),
+ false);
case Intrinsic::r600_read_ngroups_z:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::NGROUPS_Z, 4, false);
+ SI::KernelInputOffsets::NGROUPS_Z, Align(4),
+ false);
case Intrinsic::r600_read_global_size_x:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::GLOBAL_SIZE_X, 4, false);
+ SI::KernelInputOffsets::GLOBAL_SIZE_X,
+ Align(4), false);
case Intrinsic::r600_read_global_size_y:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::GLOBAL_SIZE_Y, 4, false);
+ SI::KernelInputOffsets::GLOBAL_SIZE_Y,
+ Align(4), false);
case Intrinsic::r600_read_global_size_z:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::GLOBAL_SIZE_Z, 4, false);
+ SI::KernelInputOffsets::GLOBAL_SIZE_Z,
+ Align(4), false);
case Intrinsic::r600_read_local_size_x:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
@@ -5865,29 +6367,23 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return lowerImplicitZextParam(DAG, Op, MVT::i16,
SI::KernelInputOffsets::LOCAL_SIZE_Z);
case Intrinsic::amdgcn_workgroup_id_x:
- case Intrinsic::r600_read_tgid_x:
return getPreloadedValue(DAG, *MFI, VT,
AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
case Intrinsic::amdgcn_workgroup_id_y:
- case Intrinsic::r600_read_tgid_y:
return getPreloadedValue(DAG, *MFI, VT,
AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
case Intrinsic::amdgcn_workgroup_id_z:
- case Intrinsic::r600_read_tgid_z:
return getPreloadedValue(DAG, *MFI, VT,
AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
case Intrinsic::amdgcn_workitem_id_x:
- case Intrinsic::r600_read_tidig_x:
return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
SDLoc(DAG.getEntryNode()),
MFI->getArgInfo().WorkItemIDX);
case Intrinsic::amdgcn_workitem_id_y:
- case Intrinsic::r600_read_tidig_y:
return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
SDLoc(DAG.getEntryNode()),
MFI->getArgInfo().WorkItemIDY);
case Intrinsic::amdgcn_workitem_id_z:
- case Intrinsic::r600_read_tidig_z:
return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
SDLoc(DAG.getEntryNode()),
MFI->getArgInfo().WorkItemIDZ);
@@ -5901,53 +6397,11 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
if (!parseCachePolicy(Op.getOperand(3), DAG, &GLC, nullptr,
IsGFX10 ? &DLC : nullptr))
return Op;
- return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), GLC, DLC,
+ return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
DAG);
}
case Intrinsic::amdgcn_fdiv_fast:
return lowerFDIV_FAST(Op, DAG);
- case Intrinsic::amdgcn_interp_p1_f16: {
- SDValue ToM0 = DAG.getCopyToReg(DAG.getEntryNode(), DL, AMDGPU::M0,
- Op.getOperand(5), SDValue());
- if (getSubtarget()->getLDSBankCount() == 16) {
- // 16 bank LDS
-
- // FIXME: This implicitly will insert a second CopyToReg to M0.
- SDValue S = DAG.getNode(
- ISD::INTRINSIC_WO_CHAIN, DL, MVT::f32,
- DAG.getTargetConstant(Intrinsic::amdgcn_interp_mov, DL, MVT::i32),
- DAG.getConstant(2, DL, MVT::i32), // P0
- Op.getOperand(2), // Attrchan
- Op.getOperand(3), // Attr
- Op.getOperand(5)); // m0
-
- SDValue Ops[] = {
- Op.getOperand(1), // Src0
- Op.getOperand(2), // Attrchan
- Op.getOperand(3), // Attr
- DAG.getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
- S, // Src2 - holds two f16 values selected by high
- DAG.getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
- Op.getOperand(4), // high
- DAG.getTargetConstant(0, DL, MVT::i1), // $clamp
- DAG.getTargetConstant(0, DL, MVT::i32) // $omod
- };
- return DAG.getNode(AMDGPUISD::INTERP_P1LV_F16, DL, MVT::f32, Ops);
- } else {
- // 32 bank LDS
- SDValue Ops[] = {
- Op.getOperand(1), // Src0
- Op.getOperand(2), // Attrchan
- Op.getOperand(3), // Attr
- DAG.getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
- Op.getOperand(4), // high
- DAG.getTargetConstant(0, DL, MVT::i1), // $clamp
- DAG.getTargetConstant(0, DL, MVT::i32), // $omod
- ToM0.getValue(1)
- };
- return DAG.getNode(AMDGPUISD::INTERP_P1LL_F16, DL, MVT::f32, Ops);
- }
- }
case Intrinsic::amdgcn_sin:
return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
@@ -5988,9 +6442,6 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
- case Intrinsic::amdgcn_trig_preop:
- return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
- Op.getOperand(1), Op.getOperand(2));
case Intrinsic::amdgcn_div_scale: {
const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
@@ -6020,6 +6471,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_fcmp: {
return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
}
+ case Intrinsic::amdgcn_ballot:
+ return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
case Intrinsic::amdgcn_fmed3:
return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
@@ -6098,6 +6551,19 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
DAG.getConstant(1, SL, MVT::i32));
return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
}
+ case Intrinsic::amdgcn_alignbit:
+ return DAG.getNode(ISD::FSHR, DL, VT,
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
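      // Editorial note (illustrative, not part of the patch): lowering
      // alignbit to ISD::FSHR relies on the funnel-shift-right identity
      //   fshr(hi, lo, amt) = (lo >> (amt & 31)) | (hi << (32 - (amt & 31)))
      // for amt & 31 != 0, and fshr(hi, lo, amt) = lo when amt & 31 == 0,
      // which is what v_alignbit_b32 computes on the concatenated 64-bit
      // value (hi:lo).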
+ case Intrinsic::amdgcn_reloc_constant: {
+ Module *M = const_cast<Module *>(MF.getFunction().getParent());
+ const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
+ auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
+ auto RelocSymbol = cast<GlobalVariable>(
+ M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
+ SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
+ SIInstrInfo::MO_ABS32_LO);
+ return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
+ }
default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
@@ -6131,6 +6597,28 @@ static unsigned getBufferOffsetForMMO(SDValue VOffset,
cast<ConstantSDNode>(Offset)->getSExtValue();
}
+static unsigned getDSShaderTypeValue(const MachineFunction &MF) {
+ switch (MF.getFunction().getCallingConv()) {
+ case CallingConv::AMDGPU_PS:
+ return 1;
+ case CallingConv::AMDGPU_VS:
+ return 2;
+ case CallingConv::AMDGPU_GS:
+ return 3;
+ case CallingConv::AMDGPU_HS:
+ case CallingConv::AMDGPU_LS:
+ case CallingConv::AMDGPU_ES:
+ report_fatal_error("ds_ordered_count unsupported for this calling conv");
+ case CallingConv::AMDGPU_CS:
+ case CallingConv::AMDGPU_KERNEL:
+ case CallingConv::C:
+ case CallingConv::Fast:
+ default:
+ // Assume other calling conventions are various compute callable functions
+ return 0;
+ }
+}
+
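A minimal sketch (editorial, not part of the patch; the helper name is illustrative) of how the shader type returned by the new helper combines with the other ds_ordered_count fields in the offset1 encoding assembled in the lowering below:

static unsigned packDSOrderedOffset1(unsigned WaveRelease, unsigned WaveDone,
                                     unsigned ShaderType, unsigned Instruction) {
  // Matches the packing performed in LowerINTRINSIC_W_CHAIN:
  //   offset1 = wave_release | (wave_done << 1) | (shader_type << 2) | (instruction << 4)
  return WaveRelease | (WaveDone << 1) | (ShaderType << 2) | (Instruction << 4);
}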
SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
@@ -6146,8 +6634,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
unsigned IndexOperand = M->getConstantOperandVal(7);
unsigned WaveRelease = M->getConstantOperandVal(8);
unsigned WaveDone = M->getConstantOperandVal(9);
- unsigned ShaderType;
- unsigned Instruction;
unsigned OrderedCountIndex = IndexOperand & 0x3f;
IndexOperand &= ~0x3f;
@@ -6166,36 +6652,11 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
if (IndexOperand)
report_fatal_error("ds_ordered_count: bad index operand");
- switch (IntrID) {
- case Intrinsic::amdgcn_ds_ordered_add:
- Instruction = 0;
- break;
- case Intrinsic::amdgcn_ds_ordered_swap:
- Instruction = 1;
- break;
- }
-
if (WaveDone && !WaveRelease)
report_fatal_error("ds_ordered_count: wave_done requires wave_release");
- switch (DAG.getMachineFunction().getFunction().getCallingConv()) {
- case CallingConv::AMDGPU_CS:
- case CallingConv::AMDGPU_KERNEL:
- ShaderType = 0;
- break;
- case CallingConv::AMDGPU_PS:
- ShaderType = 1;
- break;
- case CallingConv::AMDGPU_VS:
- ShaderType = 2;
- break;
- case CallingConv::AMDGPU_GS:
- ShaderType = 3;
- break;
- default:
- report_fatal_error("ds_ordered_count unsupported for this calling conv");
- }
-
+ unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
+ unsigned ShaderType = getDSShaderTypeValue(DAG.getMachineFunction());
unsigned Offset0 = OrderedCountIndex << 2;
unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
(Instruction << 4);
@@ -6425,6 +6886,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_buffer_atomic_swap:
case Intrinsic::amdgcn_buffer_atomic_add:
case Intrinsic::amdgcn_buffer_atomic_sub:
+ case Intrinsic::amdgcn_buffer_atomic_csub:
case Intrinsic::amdgcn_buffer_atomic_smin:
case Intrinsic::amdgcn_buffer_atomic_umin:
case Intrinsic::amdgcn_buffer_atomic_smax:
@@ -6467,6 +6929,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_buffer_atomic_sub:
Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
break;
+ case Intrinsic::amdgcn_buffer_atomic_csub:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_CSUB;
+ break;
case Intrinsic::amdgcn_buffer_atomic_smin:
Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
break;
@@ -6715,6 +7180,18 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
Op->getVTList(), Ops, VT, M->getMemOperand());
}
+ case Intrinsic::amdgcn_global_atomic_csub: {
+ MemSDNode *M = cast<MemSDNode>(Op);
+ SDValue Ops[] = {
+ M->getOperand(0), // Chain
+ M->getOperand(2), // Ptr
+ M->getOperand(3) // Value
+ };
+
+ return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_LOAD_CSUB, SDLoc(Op),
+ M->getVTList(), Ops, M->getMemoryVT(),
+ M->getMemOperand());
+ }
default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
@@ -6750,9 +7227,8 @@ SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
auto NewOp = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
WidenedMemVT, MMO);
if (WidenedVT != VT) {
- auto Extract = DAG.getNode(
- ISD::EXTRACT_SUBVECTOR, DL, VT, NewOp,
- DAG.getConstant(0, DL, getVectorIdxTy(DAG.getDataLayout())));
+ auto Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, NewOp,
+ DAG.getVectorIdxConstant(0, DL));
NewOp = DAG.getMergeValues({ Extract, SDValue(NewOp.getNode(), 1) }, DL);
}
return NewOp;
@@ -6792,52 +7268,29 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
MachineFunction &MF = DAG.getMachineFunction();
switch (IntrinsicID) {
- case Intrinsic::amdgcn_exp: {
- const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
- const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
- const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(8));
- const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(9));
-
- const SDValue Ops[] = {
- Chain,
- DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
- DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en
- Op.getOperand(4), // src0
- Op.getOperand(5), // src1
- Op.getOperand(6), // src2
- Op.getOperand(7), // src3
- DAG.getTargetConstant(0, DL, MVT::i1), // compr
- DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
- };
-
- unsigned Opc = Done->isNullValue() ?
- AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
- return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
- }
case Intrinsic::amdgcn_exp_compr: {
- const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
- const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
SDValue Src0 = Op.getOperand(4);
SDValue Src1 = Op.getOperand(5);
- const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
- const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(7));
+ // Hack around illegal type on SI by directly selecting it.
+ if (isTypeLegal(Src0.getValueType()))
+ return SDValue();
+ const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
SDValue Undef = DAG.getUNDEF(MVT::f32);
const SDValue Ops[] = {
- Chain,
- DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
- DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en
- DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0),
- DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1),
+ Op.getOperand(2), // tgt
+ DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
+ DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
Undef, // src2
Undef, // src3
+ Op.getOperand(7), // vm
DAG.getTargetConstant(1, DL, MVT::i1), // compr
- DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
+ Op.getOperand(3), // en
+ Op.getOperand(0) // Chain
};
- unsigned Opc = Done->isNullValue() ?
- AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
- return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
+ unsigned Opc = Done->isNullValue() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
+ return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
}
case Intrinsic::amdgcn_s_barrier: {
if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
@@ -7183,13 +7636,14 @@ std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
// three offsets (voffset, soffset and instoffset) into the SDValue[3] array
// pointed to by Offsets.
unsigned SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
- SelectionDAG &DAG, SDValue *Offsets,
- unsigned Align) const {
+ SelectionDAG &DAG, SDValue *Offsets,
+ Align Alignment) const {
SDLoc DL(CombinedOffset);
if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
uint32_t Imm = C->getZExtValue();
uint32_t SOffset, ImmOffset;
- if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget, Align)) {
+ if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget,
+ Alignment)) {
Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
@@ -7202,7 +7656,7 @@ unsigned SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
uint32_t SOffset, ImmOffset;
int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
if (Offset >= 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
- Subtarget, Align)) {
+ Subtarget, Alignment)) {
Offsets[0] = N0;
Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
@@ -7413,7 +7867,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
// If there is a possibility that flat instructions access scratch memory
// then we need to use the same legalization rules we use for private.
- if (AS == AMDGPUAS::FLAT_ADDRESS)
+ if (AS == AMDGPUAS::FLAT_ADDRESS &&
+ !Subtarget->hasMultiDwordFlatScratchAddressing())
AS = MFI->hasFlatScratchInit() ?
AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
@@ -7438,7 +7893,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
AS == AMDGPUAS::GLOBAL_ADDRESS) {
if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
- !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load) &&
+ Load->isSimple() && isMemOpHasNoClobberedMemOperand(Load) &&
Alignment >= 4 && NumElements < 32) {
if (MemVT.isPow2VectorType())
return SDValue();
@@ -7547,55 +8002,54 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
SDValue RHS = Op.getOperand(1);
EVT VT = Op.getValueType();
const SDNodeFlags Flags = Op->getFlags();
- bool Unsafe = DAG.getTarget().Options.UnsafeFPMath || Flags.hasAllowReciprocal();
- if (!Unsafe && VT == MVT::f32 && hasFP32Denormals(DAG.getMachineFunction()))
+ bool AllowInaccurateRcp = DAG.getTarget().Options.UnsafeFPMath ||
+ Flags.hasApproximateFuncs();
+
+ // Without !fpmath accuracy information, we can't do more because we don't
+  // know exactly whether rcp is accurate enough to meet the !fpmath requirement.
+ if (!AllowInaccurateRcp)
return SDValue();
if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
- if (Unsafe || VT == MVT::f32 || VT == MVT::f16) {
- if (CLHS->isExactlyValue(1.0)) {
- // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
- // the CI documentation has a worst case error of 1 ulp.
- // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
- // use it as long as we aren't trying to use denormals.
- //
- // v_rcp_f16 and v_rsq_f16 DO support denormals.
-
- // 1.0 / sqrt(x) -> rsq(x)
-
- // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
- // error seems really high at 2^29 ULP.
- if (RHS.getOpcode() == ISD::FSQRT)
- return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
-
- // 1.0 / x -> rcp(x)
- return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
- }
+ if (CLHS->isExactlyValue(1.0)) {
+ // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
+ // the CI documentation has a worst case error of 1 ulp.
+ // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
+ // use it as long as we aren't trying to use denormals.
+ //
+ // v_rcp_f16 and v_rsq_f16 DO support denormals.
- // Same as for 1.0, but expand the sign out of the constant.
- if (CLHS->isExactlyValue(-1.0)) {
- // -1.0 / x -> rcp (fneg x)
- SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
- return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
- }
+ // 1.0 / sqrt(x) -> rsq(x)
+
+ // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
+ // error seems really high at 2^29 ULP.
+ if (RHS.getOpcode() == ISD::FSQRT)
+ return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
+
+ // 1.0 / x -> rcp(x)
+ return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
}
- }
- if (Unsafe) {
- // Turn into multiply by the reciprocal.
- // x / y -> x * (1.0 / y)
- SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
- return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
+ // Same as for 1.0, but expand the sign out of the constant.
+ if (CLHS->isExactlyValue(-1.0)) {
+ // -1.0 / x -> rcp (fneg x)
+ SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
+ return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
+ }
}
- return SDValue();
+ // Turn into multiply by the reciprocal.
+ // x / y -> x * (1.0 / y)
+ SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
+ return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
}
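A minimal host-side sketch (illustrative only, not part of the patch) of the rewrites this function now performs once an inaccurate reciprocal is acceptable:

static float fastUnsafeFDivSketch(float LHS, float RHS) {
  // 1.0 / sqrt(x) -> rsq(x), 1.0 / x -> rcp(x), -1.0 / x -> rcp(-x);
  // in the general case, x / y -> x * rcp(y).
  float Recip = 1.0f / RHS; // stands in for the hardware rcp approximation
  return LHS * Recip;
}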
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
- EVT VT, SDValue A, SDValue B, SDValue GlueChain) {
+ EVT VT, SDValue A, SDValue B, SDValue GlueChain,
+ SDNodeFlags Flags) {
if (GlueChain->getNumValues() <= 1) {
- return DAG.getNode(Opcode, SL, VT, A, B);
+ return DAG.getNode(Opcode, SL, VT, A, B, Flags);
}
assert(GlueChain->getNumValues() == 3);
@@ -7608,15 +8062,16 @@ static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
break;
}
- return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B,
- GlueChain.getValue(2));
+ return DAG.getNode(Opcode, SL, VTList,
+ {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
+ Flags);
}
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
EVT VT, SDValue A, SDValue B, SDValue C,
- SDValue GlueChain) {
+ SDValue GlueChain, SDNodeFlags Flags) {
if (GlueChain->getNumValues() <= 1) {
- return DAG.getNode(Opcode, SL, VT, A, B, C);
+ return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
}
assert(GlueChain->getNumValues() == 3);
@@ -7629,8 +8084,9 @@ static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
break;
}
- return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, C,
- GlueChain.getValue(2));
+ return DAG.getNode(Opcode, SL, VTList,
+ {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
+ Flags);
}
SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
@@ -7704,6 +8160,13 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
return FastLowered;
+  // The selection matcher assumes anything with a chain selects to a
+ // mayRaiseFPException machine instruction. Since we're introducing a chain
+ // here, we need to explicitly report nofpexcept for the regular fdiv
+ // lowering.
+ SDNodeFlags Flags = Op->getFlags();
+ Flags.setNoFPExcept(true);
+
SDLoc SL(Op);
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
@@ -7713,95 +8176,100 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
- RHS, RHS, LHS);
+ {RHS, RHS, LHS}, Flags);
SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
- LHS, RHS, LHS);
+ {LHS, RHS, LHS}, Flags);
// Denominator is scaled to not be denormal, so using rcp is ok.
SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
- DenominatorScaled);
+ DenominatorScaled, Flags);
SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
- DenominatorScaled);
+ DenominatorScaled, Flags);
const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE |
(4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
(1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
- const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16);
+ const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
const bool HasFP32Denormals = hasFP32Denormals(DAG.getMachineFunction());
if (!HasFP32Denormals) {
+ // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
+ // lowering. The chain dependence is insufficient, and we need glue. We do
+ // not need the glue variants in a strictfp function.
+
SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue EnableDenorm;
+ SDNode *EnableDenorm;
if (Subtarget->hasDenormModeInst()) {
const SDValue EnableDenormValue =
getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, SL, Subtarget);
EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
- DAG.getEntryNode(), EnableDenormValue);
+ DAG.getEntryNode(), EnableDenormValue).getNode();
} else {
const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
SL, MVT::i32);
- EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs,
- DAG.getEntryNode(), EnableDenormValue,
- BitField);
+ EnableDenorm =
+ DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
+ {EnableDenormValue, BitField, DAG.getEntryNode()});
}
SDValue Ops[3] = {
NegDivScale0,
- EnableDenorm.getValue(0),
- EnableDenorm.getValue(1)
+ SDValue(EnableDenorm, 0),
+ SDValue(EnableDenorm, 1)
};
NegDivScale0 = DAG.getMergeValues(Ops, SL);
}
SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
- ApproxRcp, One, NegDivScale0);
+ ApproxRcp, One, NegDivScale0, Flags);
SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
- ApproxRcp, Fma0);
+ ApproxRcp, Fma0, Flags);
SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
- Fma1, Fma1);
+ Fma1, Fma1, Flags);
SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
- NumeratorScaled, Mul);
+ NumeratorScaled, Mul, Flags);
- SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2);
+ SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32,
+ Fma2, Fma1, Mul, Fma2, Flags);
SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
- NumeratorScaled, Fma3);
+ NumeratorScaled, Fma3, Flags);
if (!HasFP32Denormals) {
- SDValue DisableDenorm;
+ SDNode *DisableDenorm;
if (Subtarget->hasDenormModeInst()) {
const SDValue DisableDenormValue =
getSPDenormModeValue(FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, SL, Subtarget);
DisableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other,
Fma4.getValue(1), DisableDenormValue,
- Fma4.getValue(2));
+ Fma4.getValue(2)).getNode();
} else {
const SDValue DisableDenormValue =
DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
- DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other,
- Fma4.getValue(1), DisableDenormValue,
- BitField, Fma4.getValue(2));
+ DisableDenorm = DAG.getMachineNode(
+ AMDGPU::S_SETREG_B32, SL, MVT::Other,
+ {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
}
SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
- DisableDenorm, DAG.getRoot());
+ SDValue(DisableDenorm, 0), DAG.getRoot());
DAG.setRoot(OutputChain);
}
SDValue Scale = NumeratorScaled.getValue(1);
SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
- Fma4, Fma1, Fma3, Scale);
+ {Fma4, Fma1, Fma3, Scale}, Flags);
- return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS);
+ return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
}
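A simplified host-side sketch (editorial, not part of the patch) of the FMA-based reciprocal refinement built above. The actual lowering runs the same steps on scaled operands, adds one further correction, and finishes with DIV_FMAS/DIV_FIXUP.

#include <cmath>

static float refineQuotientSketch(float N, float D) {
  float R0 = 1.0f / D;               // stands in for the rcp approximation
  float E0 = std::fma(-D, R0, 1.0f); // reciprocal error
  float R1 = std::fma(E0, R0, R0);   // refined reciprocal
  float Q0 = N * R1;                 // first quotient estimate
  float E1 = std::fma(-D, Q0, N);    // quotient error
  return std::fma(E1, R1, Q0);       // refined quotient
}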
SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
@@ -7916,7 +8384,8 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
// If there is a possibility that flat instructions access scratch memory
// then we need to use the same legalization rules we use for private.
- if (AS == AMDGPUAS::FLAT_ADDRESS)
+ if (AS == AMDGPUAS::FLAT_ADDRESS &&
+ !Subtarget->hasMultiDwordFlatScratchAddressing())
AS = MFI->hasFlatScratchInit() ?
AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
@@ -7976,22 +8445,24 @@ SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
SDValue Arg = Op.getOperand(0);
SDValue TrigVal;
- // TODO: Should this propagate fast-math-flags?
+ // Propagate fast-math flags so that the multiply we introduce can be folded
+  // if Arg is already the result of a multiply by a constant.
+ auto Flags = Op->getFlags();
- SDValue OneOver2Pi = DAG.getConstantFP(0.5 / M_PI, DL, VT);
+ SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
if (Subtarget->hasTrigReducedRange()) {
- SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi);
- TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal);
+ SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
+ TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
} else {
- TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi);
+ TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
}
switch (Op.getOpcode()) {
case ISD::FCOS:
- return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal);
+ return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
case ISD::FSIN:
- return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal);
+ return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
default:
llvm_unreachable("Wrong trig opcode");
}
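A small host-side sketch (illustrative, not part of the patch) of the range reduction above: the hardware SIN/COS take an argument expressed in revolutions, so the input is scaled by 1/(2*pi) and, on subtargets with a reduced trig range, reduced to its fractional part first.

#include <cmath>

static float lowerSinSketch(float Arg, bool HasTrigReducedRange) {
  float Revs = Arg * 0.15915494309189535f; // Arg * 0.5 * (1/pi)
  if (HasTrigReducedRange)
    Revs -= std::floor(Revs);              // FRACT
  // Model the revolutions-based hardware SIN with sin(2*pi*x) here.
  return std::sin(Revs * 6.283185307179586f);
}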
@@ -8032,7 +8503,7 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
EVT VT = N->getValueType(0);
EVT ScalarVT = VT.getScalarType();
- if (ScalarVT != MVT::f32)
+ if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
return SDValue();
SelectionDAG &DAG = DCI.DAG;
@@ -8047,8 +8518,14 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
// about in practice.
if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
- SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src);
+ SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
DCI.AddToWorklist(Cvt.getNode());
+
+ // For the f16 case, fold to a cast to f32 and then cast back to f16.
+ if (ScalarVT != MVT::f32) {
+ Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
+ DAG.getTargetConstant(0, DL, MVT::i32));
+ }
return Cvt;
}
}
@@ -8525,7 +9002,7 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
}
}
- if (VT != MVT::i64)
+ if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
return SDValue();
// TODO: This could be a generic combine with a predicate for extracting the
@@ -8735,6 +9212,11 @@ SDValue SITargetLowering::performRcpCombine(SDNode *N,
N->getFlags());
}
+ if ((VT == MVT::f32 || VT == MVT::f16) && N0.getOpcode() == ISD::FSQRT) {
+ return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT,
+ N0.getOperand(0), N->getFlags());
+ }
+
return AMDGPUTargetLowering::performRcpCombine(N, DCI);
}
@@ -8776,9 +9258,7 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
case AMDGPUISD::RSQ:
case AMDGPUISD::RSQ_CLAMP:
case AMDGPUISD::RCP_LEGACY:
- case AMDGPUISD::RSQ_LEGACY:
case AMDGPUISD::RCP_IFLAG:
- case AMDGPUISD::TRIG_PREOP:
case AMDGPUISD::DIV_SCALE:
case AMDGPUISD::DIV_FMAS:
case AMDGPUISD::DIV_FIXUP:
@@ -8881,6 +9361,12 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
case Intrinsic::amdgcn_cubeid:
case Intrinsic::amdgcn_frexp_mant:
case Intrinsic::amdgcn_fdot2:
+ case Intrinsic::amdgcn_rcp:
+ case Intrinsic::amdgcn_rsq:
+ case Intrinsic::amdgcn_rsq_clamp:
+ case Intrinsic::amdgcn_rcp_legacy:
+ case Intrinsic::amdgcn_rsq_legacy:
+ case Intrinsic::amdgcn_trig_preop:
return true;
default:
break;
@@ -9099,8 +9585,7 @@ SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
return SDValue();
// Ordered >= (although NaN inputs should have folded away by now).
- APFloat::cmpResult Cmp = K0->getValueAPF().compare(K1->getValueAPF());
- if (Cmp == APFloat::cmpGreaterThan)
+ if (K0->getValueAPF() > K1->getValueAPF())
return SDValue();
const MachineFunction &MF = DAG.getMachineFunction();
@@ -9275,6 +9760,50 @@ SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
return SDValue();
}
+// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
+// expanded into a set of cmp/select instructions.
+bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
+ unsigned NumElem,
+ bool IsDivergentIdx) {
+ if (UseDivergentRegisterIndexing)
+ return false;
+
+ unsigned VecSize = EltSize * NumElem;
+
+  // Sub-dword vectors of 2 dwords or less have a better implementation.
+ if (VecSize <= 64 && EltSize < 32)
+ return false;
+
+  // Always expand the remaining sub-dword cases, otherwise they will be
+  // lowered via memory.
+ if (EltSize < 32)
+ return true;
+
+ // Always do this if var-idx is divergent, otherwise it will become a loop.
+ if (IsDivergentIdx)
+ return true;
+
+ // Large vectors would yield too many compares and v_cndmask_b32 instructions.
+ unsigned NumInsts = NumElem /* Number of compares */ +
+ ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
+ return NumInsts <= 16;
+}
+
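A quick worked example (editorial, not part of the patch) of the instruction-count bound above: expanding a dynamic index into <8 x i32> needs 8 compares plus 8 v_cndmask_b32, i.e. 16 instructions, which is still accepted; <8 x i64> needs 8 compares plus 16 cndmasks (24 total), so it is left alone unless the index is divergent.

static unsigned dynExtExpansionCost(unsigned EltSize, unsigned NumElem) {
  // Mirrors the bound checked above: one compare per element plus one
  // v_cndmask_b32 per 32-bit chunk per element.
  return NumElem + ((EltSize + 31) / 32) * NumElem;
}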
+static bool shouldExpandVectorDynExt(SDNode *N) {
+ SDValue Idx = N->getOperand(N->getNumOperands() - 1);
+ if (isa<ConstantSDNode>(Idx))
+ return false;
+
+ SDValue Vec = N->getOperand(0);
+ EVT VecVT = Vec.getValueType();
+ EVT EltVT = VecVT.getVectorElementType();
+ unsigned EltSize = EltVT.getSizeInBits();
+ unsigned NumElem = VecVT.getVectorNumElements();
+
+ return SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
+ Idx->isDivergent());
+}
+
SDValue SITargetLowering::performExtractVectorEltCombine(
SDNode *N, DAGCombinerInfo &DCI) const {
SDValue Vec = N->getOperand(0);
@@ -9336,18 +9865,12 @@ SDValue SITargetLowering::performExtractVectorEltCombine(
unsigned EltSize = EltVT.getSizeInBits();
// EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
- // This elminates non-constant index and subsequent movrel or scratch access.
- // Sub-dword vectors of size 2 dword or less have better implementation.
- // Vectors of size bigger than 8 dwords would yield too many v_cndmask_b32
- // instructions.
- if (VecSize <= 256 && (VecSize > 64 || EltSize >= 32) &&
- !isa<ConstantSDNode>(N->getOperand(1))) {
+ if (::shouldExpandVectorDynExt(N)) {
SDLoc SL(N);
SDValue Idx = N->getOperand(1);
- EVT IdxVT = Idx.getValueType();
SDValue V;
for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
- SDValue IC = DAG.getConstant(I, SL, IdxVT);
+ SDValue IC = DAG.getVectorIdxConstant(I, SL);
SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
if (I == 0)
V = Elt;
@@ -9402,17 +9925,10 @@ SITargetLowering::performInsertVectorEltCombine(SDNode *N,
SDValue Idx = N->getOperand(2);
EVT VecVT = Vec.getValueType();
EVT EltVT = VecVT.getVectorElementType();
- unsigned VecSize = VecVT.getSizeInBits();
- unsigned EltSize = EltVT.getSizeInBits();
// INSERT_VECTOR_ELT (<n x e>, var-idx)
// => BUILD_VECTOR n x select (e, const-idx)
- // This elminates non-constant index and subsequent movrel or scratch access.
- // Sub-dword vectors of size 2 dword or less have better implementation.
- // Vectors of size bigger than 8 dwords would yield too many v_cndmask_b32
- // instructions.
- if (isa<ConstantSDNode>(Idx) ||
- VecSize > 256 || (VecSize <= 64 && EltSize < 32))
+ if (!::shouldExpandVectorDynExt(N))
return SDValue();
SelectionDAG &DAG = DCI.DAG;
@@ -9919,39 +10435,50 @@ SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
SDValue Src = N->getOperand(0);
- SDValue Srl = N->getOperand(0);
- if (Srl.getOpcode() == ISD::ZERO_EXTEND)
- Srl = Srl.getOperand(0);
+ SDValue Shift = N->getOperand(0);
- // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero.
- if (Srl.getOpcode() == ISD::SRL) {
+ // TODO: Extend type shouldn't matter (assuming legal types).
+ if (Shift.getOpcode() == ISD::ZERO_EXTEND)
+ Shift = Shift.getOperand(0);
+
+ if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
+ // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
+ // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
// cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
// cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
- // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
-
- if (const ConstantSDNode *C =
- dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
- Srl = DAG.getZExtOrTrunc(Srl.getOperand(0), SDLoc(Srl.getOperand(0)),
- EVT(MVT::i32));
+ // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
+ if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
+ Shift = DAG.getZExtOrTrunc(Shift.getOperand(0),
+ SDLoc(Shift.getOperand(0)), MVT::i32);
+
+ unsigned ShiftOffset = 8 * Offset;
+ if (Shift.getOpcode() == ISD::SHL)
+ ShiftOffset -= C->getZExtValue();
+ else
+ ShiftOffset += C->getZExtValue();
- unsigned SrcOffset = C->getZExtValue() + 8 * Offset;
- if (SrcOffset < 32 && SrcOffset % 8 == 0) {
- return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, SL,
- MVT::f32, Srl);
+ if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
+ return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
+ MVT::f32, Shift);
}
}
}
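A short worked example (editorial, not part of the patch) of the ShiftOffset arithmetic above: for cvt_f32_ubyte1 (shl x, 8) the byte offset starts at 8*1 = 8 and the left shift subtracts 8, selecting byte 0; for cvt_f32_ubyte0 (srl x, 16) the right shift adds 16, selecting byte 2.

static int foldedByteIndexSketch(unsigned Offset, bool IsShl, unsigned ShiftAmt) {
  int ShiftOffset = 8 * (int)Offset + (IsShl ? -(int)ShiftAmt : (int)ShiftAmt);
  // The fold is only possible for a whole, in-range byte.
  if (ShiftOffset >= 0 && ShiftOffset < 32 && (ShiftOffset % 8) == 0)
    return ShiftOffset / 8; // index of the new CVT_F32_UBYTEn
  return -1;                // no fold
}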
- APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
-
- KnownBits Known;
- TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
- !DCI.isBeforeLegalizeOps());
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (TLI.SimplifyDemandedBits(Src, Demanded, Known, TLO)) {
- DCI.CommitTargetLoweringOpt(TLO);
+ APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
+ if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
+ // We simplified Src. If this node is not dead, visit it again so it is
+ // folded properly.
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ DCI.AddToWorklist(N);
+ return SDValue(N, 0);
}
+ // Handle (or x, (srl y, 8)) pattern when known bits are zero.
+ if (SDValue DemandedSrc =
+ TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
+ return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
+
return SDValue();
}
@@ -9964,16 +10491,13 @@ SDValue SITargetLowering::performClampCombine(SDNode *N,
const MachineFunction &MF = DCI.DAG.getMachineFunction();
const APFloat &F = CSrc->getValueAPF();
APFloat Zero = APFloat::getZero(F.getSemantics());
- APFloat::cmpResult Cmp0 = F.compare(Zero);
- if (Cmp0 == APFloat::cmpLessThan ||
- (Cmp0 == APFloat::cmpUnordered &&
- MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
+ if (F < Zero ||
+ (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
}
APFloat One(F.getSemantics(), "1.0");
- APFloat::cmpResult Cmp1 = F.compare(One);
- if (Cmp1 == APFloat::cmpGreaterThan)
+ if (F > One)
return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
return SDValue(CSrc, 0);
@@ -10061,10 +10585,10 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
case AMDGPUISD::FRACT:
case AMDGPUISD::RSQ:
case AMDGPUISD::RCP_LEGACY:
- case AMDGPUISD::RSQ_LEGACY:
case AMDGPUISD::RCP_IFLAG:
case AMDGPUISD::RSQ_CLAMP:
case AMDGPUISD::LDEXP: {
+ // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
SDValue Src = N->getOperand(0);
if (Src.isUndef())
return Src;
@@ -10406,24 +10930,6 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
Ops.push_back(ImpDef.getValue(1));
return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
}
- case AMDGPU::V_PERMLANE16_B32:
- case AMDGPU::V_PERMLANEX16_B32: {
- ConstantSDNode *FI = cast<ConstantSDNode>(Node->getOperand(0));
- ConstantSDNode *BC = cast<ConstantSDNode>(Node->getOperand(2));
- if (!FI->getZExtValue() && !BC->getZExtValue())
- break;
- SDValue VDstIn = Node->getOperand(6);
- if (VDstIn.isMachineOpcode()
- && VDstIn.getMachineOpcode() == AMDGPU::IMPLICIT_DEF)
- break;
- MachineSDNode *ImpDef = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF,
- SDLoc(Node), MVT::i32);
- SmallVector<SDValue, 8> Ops = { SDValue(FI, 0), Node->getOperand(1),
- SDValue(BC, 0), Node->getOperand(3),
- Node->getOperand(4), Node->getOperand(5),
- SDValue(ImpDef, 0), Node->getOperand(7) };
- return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
- }
default:
break;
}
@@ -10592,89 +11098,50 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
MVT VT) const {
const TargetRegisterClass *RC = nullptr;
if (Constraint.size() == 1) {
+ const unsigned BitWidth = VT.getSizeInBits();
switch (Constraint[0]) {
default:
return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
case 's':
case 'r':
- switch (VT.getSizeInBits()) {
- default:
- return std::make_pair(0U, nullptr);
- case 32:
+ switch (BitWidth) {
case 16:
RC = &AMDGPU::SReg_32RegClass;
break;
case 64:
RC = &AMDGPU::SGPR_64RegClass;
break;
- case 96:
- RC = &AMDGPU::SReg_96RegClass;
- break;
- case 128:
- RC = &AMDGPU::SGPR_128RegClass;
- break;
- case 160:
- RC = &AMDGPU::SReg_160RegClass;
- break;
- case 256:
- RC = &AMDGPU::SReg_256RegClass;
- break;
- case 512:
- RC = &AMDGPU::SReg_512RegClass;
+ default:
+ RC = SIRegisterInfo::getSGPRClassForBitWidth(BitWidth);
+ if (!RC)
+ return std::make_pair(0U, nullptr);
break;
}
break;
case 'v':
- switch (VT.getSizeInBits()) {
- default:
- return std::make_pair(0U, nullptr);
- case 32:
+ switch (BitWidth) {
case 16:
RC = &AMDGPU::VGPR_32RegClass;
break;
- case 64:
- RC = &AMDGPU::VReg_64RegClass;
- break;
- case 96:
- RC = &AMDGPU::VReg_96RegClass;
- break;
- case 128:
- RC = &AMDGPU::VReg_128RegClass;
- break;
- case 160:
- RC = &AMDGPU::VReg_160RegClass;
- break;
- case 256:
- RC = &AMDGPU::VReg_256RegClass;
- break;
- case 512:
- RC = &AMDGPU::VReg_512RegClass;
+ default:
+ RC = SIRegisterInfo::getVGPRClassForBitWidth(BitWidth);
+ if (!RC)
+ return std::make_pair(0U, nullptr);
break;
}
break;
case 'a':
if (!Subtarget->hasMAIInsts())
break;
- switch (VT.getSizeInBits()) {
- default:
- return std::make_pair(0U, nullptr);
- case 32:
+ switch (BitWidth) {
case 16:
RC = &AMDGPU::AGPR_32RegClass;
break;
- case 64:
- RC = &AMDGPU::AReg_64RegClass;
- break;
- case 128:
- RC = &AMDGPU::AReg_128RegClass;
- break;
- case 512:
- RC = &AMDGPU::AReg_512RegClass;
+ default:
+ RC = SIRegisterInfo::getAGPRClassForBitWidth(BitWidth);
+ if (!RC)
+ return std::make_pair(0U, nullptr);
break;
- case 1024:
- RC = &AMDGPU::AReg_1024RegClass;
- // v32 types are not legal but we support them here.
- return std::make_pair(0U, RC);
}
break;
}
@@ -10701,9 +11168,29 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return std::make_pair(RC->getRegister(Idx), RC);
}
}
+
+ // FIXME: Returns VS_32 for physical SGPR constraints
return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}
+static bool isImmConstraint(StringRef Constraint) {
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ default: break;
+ case 'I':
+ case 'J':
+ case 'A':
+ case 'B':
+ case 'C':
+ return true;
+ }
+ } else if (Constraint == "DA" ||
+ Constraint == "DB") {
+ return true;
+ }
+ return false;
+}
+
SITargetLowering::ConstraintType
SITargetLowering::getConstraintType(StringRef Constraint) const {
if (Constraint.size() == 1) {
@@ -10715,9 +11202,115 @@ SITargetLowering::getConstraintType(StringRef Constraint) const {
return C_RegisterClass;
}
}
+ if (isImmConstraint(Constraint)) {
+ return C_Other;
+ }
return TargetLowering::getConstraintType(Constraint);
}
+static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
+ if (!AMDGPU::isInlinableIntLiteral(Val)) {
+ Val = Val & maskTrailingOnes<uint64_t>(Size);
+ }
+ return Val;
+}
+
+void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
+ std::string &Constraint,
+ std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const {
+ if (isImmConstraint(Constraint)) {
+ uint64_t Val;
+ if (getAsmOperandConstVal(Op, Val) &&
+ checkAsmConstraintVal(Op, Constraint, Val)) {
+ Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
+ Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
+ }
+ } else {
+ TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+ }
+}
+
+bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
+ unsigned Size = Op.getScalarValueSizeInBits();
+ if (Size > 64)
+ return false;
+
+ if (Size == 16 && !Subtarget->has16BitInsts())
+ return false;
+
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ Val = C->getSExtValue();
+ return true;
+ }
+ if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
+ Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
+ return true;
+ }
+ if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
+ if (Size != 16 || Op.getNumOperands() != 2)
+ return false;
+ if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
+ return false;
+ if (ConstantSDNode *C = V->getConstantSplatNode()) {
+ Val = C->getSExtValue();
+ return true;
+ }
+ if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
+ Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool SITargetLowering::checkAsmConstraintVal(SDValue Op,
+ const std::string &Constraint,
+ uint64_t Val) const {
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ case 'I':
+ return AMDGPU::isInlinableIntLiteral(Val);
+ case 'J':
+ return isInt<16>(Val);
+ case 'A':
+ return checkAsmConstraintValA(Op, Val);
+ case 'B':
+ return isInt<32>(Val);
+ case 'C':
+ return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
+ AMDGPU::isInlinableIntLiteral(Val);
+ default:
+ break;
+ }
+ } else if (Constraint.size() == 2) {
+ if (Constraint == "DA") {
+ int64_t HiBits = static_cast<int32_t>(Val >> 32);
+ int64_t LoBits = static_cast<int32_t>(Val);
+ return checkAsmConstraintValA(Op, HiBits, 32) &&
+ checkAsmConstraintValA(Op, LoBits, 32);
+ }
+ if (Constraint == "DB") {
+ return true;
+ }
+ }
+ llvm_unreachable("Invalid asm constraint");
+}
+
+bool SITargetLowering::checkAsmConstraintValA(SDValue Op,
+ uint64_t Val,
+ unsigned MaxSize) const {
+ unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
+ bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
+ if ((Size == 16 && AMDGPU::isInlinableLiteral16(Val, HasInv2Pi)) ||
+ (Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
+ (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi))) {
+ return true;
+ }
+ return false;
+}
+
// Figure out which registers should be reserved for stack access. Only after
// the function is legalized do we know all of the non-spill stack objects or if
// calls are present.
@@ -10745,11 +11338,6 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
- if (Info->getScratchWaveOffsetReg() != AMDGPU::SCRATCH_WAVE_OFFSET_REG) {
- MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG,
- Info->getScratchWaveOffsetReg());
- }
-
Info->limitOccupancy(MF);
if (ST.isWave32() && !MF.empty()) {
@@ -10772,15 +11360,18 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
}
TargetLoweringBase::finalizeLowering(MF);
+
+  // Allocate a VGPR for future SGPR spills if the
+  // "amdgpu-reserve-vgpr-for-sgpr-spill" option is used.
+  // FIXME: We won't need this hack if we split SGPR allocation from VGPR allocation.
+ if (VGPRReserveforSGPRSpill && !Info->VGPRReservedForSGPRSpill &&
+ !Info->isEntryFunction() && MF.getFrameInfo().hasStackObjects())
+ Info->reserveVGPRforSGPRSpills(MF);
}
-void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
- KnownBits &Known,
- const APInt &DemandedElts,
- const SelectionDAG &DAG,
- unsigned Depth) const {
- TargetLowering::computeKnownBitsForFrameIndex(Op, Known, DemandedElts,
- DAG, Depth);
+void SITargetLowering::computeKnownBitsForFrameIndex(
+ const int FI, KnownBits &Known, const MachineFunction &MF) const {
+ TargetLowering::computeKnownBitsForFrameIndex(FI, Known, MF);
// Set the high bits to zero based on the maximum allowed scratch size per
// wave. We can't use vaddr in MUBUF instructions if we don't know the address
@@ -10788,6 +11379,27 @@ void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
}
+Align SITargetLowering::computeKnownAlignForTargetInstr(
+ GISelKnownBits &KB, Register R, const MachineRegisterInfo &MRI,
+ unsigned Depth) const {
+ const MachineInstr *MI = MRI.getVRegDef(R);
+ switch (MI->getOpcode()) {
+ case AMDGPU::G_INTRINSIC:
+ case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
+ // FIXME: Can this move to generic code? What about the case where the call
+ // site specifies a lower alignment?
+ Intrinsic::ID IID = MI->getIntrinsicID();
+ LLVMContext &Ctx = KB.getMachineFunction().getFunction().getContext();
+ AttributeList Attrs = Intrinsic::getAttributes(Ctx, IID);
+ if (MaybeAlign RetAlign = Attrs.getRetAlignment())
+ return *RetAlign;
+ return Align(1);
+ }
+ default:
+ return Align(1);
+ }
+}
+
Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
const Align CacheLineAlign = Align(64);
@@ -10879,30 +11491,19 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N,
case ISD::CopyFromReg:
{
const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
- const MachineFunction * MF = FLI->MF;
- const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
- const MachineRegisterInfo &MRI = MF->getRegInfo();
- const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo();
- unsigned Reg = R->getReg();
- if (Register::isPhysicalRegister(Reg))
- return !TRI.isSGPRReg(MRI, Reg);
-
- if (MRI.isLiveIn(Reg)) {
- // workitem.id.x workitem.id.y workitem.id.z
- // Any VGPR formal argument is also considered divergent
- if (!TRI.isSGPRReg(MRI, Reg))
- return true;
- // Formal arguments of non-entry functions
- // are conservatively considered divergent
- else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv()))
- return true;
- return false;
- }
- const Value *V = FLI->getValueFromVirtualReg(Reg);
- if (V)
+ const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
+ const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
+ Register Reg = R->getReg();
+
+ // FIXME: Why does this need to consider isLiveIn?
+ if (Reg.isPhysical() || MRI.isLiveIn(Reg))
+ return !TRI->isSGPRReg(MRI, Reg);
+
+ if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
return KDA->isDivergent(V);
+
assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
- return !TRI.isSGPRReg(MRI, Reg);
+ return !TRI->isSGPRReg(MRI, Reg);
}
break;
case ISD::LOAD: {
@@ -11004,7 +11605,19 @@ SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
return RC;
}
-static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited) {
+// FIXME: This is a workaround for DivergenceAnalysis not understanding always
+// uniform values (as produced by the mask results of control flow intrinsics)
+// used outside of divergent blocks. The phi users need to also be treated as
+// always uniform.
+static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
+ unsigned WaveSize) {
+  // FIXME: We assume we never cast the mask results of a control flow
+  // intrinsic.
+  // As a compile-time hack, exit early if the type won't be consistent.
+ IntegerType *IT = dyn_cast<IntegerType>(V->getType());
+ if (!IT || IT->getBitWidth() != WaveSize)
+ return false;
+
if (!isa<Instruction>(V))
return false;
if (!Visited.insert(V).second)
@@ -11036,7 +11649,7 @@ static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited) {
}
}
} else {
- Result = hasCFUser(U, Visited);
+ Result = hasCFUser(U, Visited, WaveSize);
}
if (Result)
break;
@@ -11046,36 +11659,16 @@ static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited) {
bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
const Value *V) const {
- if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
- switch (Intrinsic->getIntrinsicID()) {
- default:
- return false;
- case Intrinsic::amdgcn_if_break:
- return true;
- }
- }
- if (const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V)) {
- if (const IntrinsicInst *Intrinsic =
- dyn_cast<IntrinsicInst>(ExtValue->getOperand(0))) {
- switch (Intrinsic->getIntrinsicID()) {
- default:
- return false;
- case Intrinsic::amdgcn_if:
- case Intrinsic::amdgcn_else: {
- ArrayRef<unsigned> Indices = ExtValue->getIndices();
- if (Indices.size() == 1 && Indices[0] == 1) {
- return true;
- }
- }
- }
- }
- }
if (const CallInst *CI = dyn_cast<CallInst>(V)) {
- if (isa<InlineAsm>(CI->getCalledValue())) {
+ if (CI->isInlineAsm()) {
+ // FIXME: This cannot give a correct answer. This should only trigger in
+ // the case where inline asm returns mixed SGPR and VGPR results, used
+ // outside the defining block. We don't have a specific result to
+ // consider, so this assumes if any value is SGPR, the overall register
+ // also needs to be SGPR.
const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
- ImmutableCallSite CS(CI);
TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
- MF.getDataLayout(), Subtarget->getRegisterInfo(), CS);
+ MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
for (auto &TC : TargetConstraints) {
if (TC.Type == InlineAsm::isOutput) {
ComputeConstraintToUse(TC, SDValue());
@@ -11095,5 +11688,20 @@ bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
}
}
SmallPtrSet<const Value *, 16> Visited;
- return hasCFUser(V, Visited);
+ return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
+}
+
+std::pair<int, MVT>
+SITargetLowering::getTypeLegalizationCost(const DataLayout &DL,
+ Type *Ty) const {
+ auto Cost = TargetLoweringBase::getTypeLegalizationCost(DL, Ty);
+ auto Size = DL.getTypeSizeInBits(Ty);
+ // Maximum load or store can handle 8 dwords for scalar and 4 for
+ // vector ALU. Let's assume anything above 8 dwords is expensive
+ // even if legal.
+ if (Size <= 256)
+ return Cost;
+
+ Cost.first = (Size + 255) / 256;
+ return Cost;
}
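A quick worked example (editorial, not part of the patch) of the cost bump above: a 1024-bit type such as <32 x i32> gets Cost.first = (1024 + 255) / 256 = 4, while anything of 256 bits or less keeps the base legalization cost.

static int expensiveTypeCostSketch(unsigned SizeInBits, int BaseCost) {
  // Mirrors the rule above: up to 8 dwords keeps the base cost,
  // larger types pay one unit per 256-bit chunk.
  return SizeInBits <= 256 ? BaseCost : (int)((SizeInBits + 255) / 256);
}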
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index d59495b052a4f..f4c0764640575 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -42,7 +42,8 @@ private:
SDValue getImplicitArgPtr(SelectionDAG &DAG, const SDLoc &SL) const;
SDValue lowerKernargMemParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
const SDLoc &SL, SDValue Chain,
- uint64_t Offset, unsigned Align, bool Signed,
+ uint64_t Offset, Align Alignment,
+ bool Signed,
const ISD::InputArg *Arg = nullptr) const;
SDValue lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
@@ -60,7 +61,7 @@ private:
SDValue lowerImage(SDValue Op, const AMDGPU::ImageDimIntrinsicInfo *Intr,
SelectionDAG &DAG) const;
SDValue lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, SDValue Offset,
- SDValue GLC, SDValue DLC, SelectionDAG &DAG) const;
+ SDValue CachePolicy, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
@@ -107,7 +108,7 @@ private:
/// Converts \p Op, which must be of floating point type, to the
/// floating point type \p VT, by either extending or truncating it.
- SDValue getFPExtOrFPTrunc(SelectionDAG &DAG,
+ SDValue getFPExtOrFPRound(SelectionDAG &DAG,
SDValue Op,
const SDLoc &DL,
EVT VT) const;
@@ -119,6 +120,7 @@ private:
/// Custom lowering for ISD::FP_ROUND for MVT::f16.
SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerXMULO(SDValue Op, SelectionDAG &DAG) const;
SDValue getSegmentAperture(unsigned AS, const SDLoc &DL,
SelectionDAG &DAG) const;
@@ -199,6 +201,15 @@ public:
/// global value \p GV, false otherwise.
bool shouldEmitPCReloc(const GlobalValue *GV) const;
+ /// \returns true if this should use a literal constant for an LDS address,
+ /// and not emit a relocation for an LDS global.
+ bool shouldUseLDSConstAddress(const GlobalValue *GV) const;
+
+ /// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
+ /// expanded into a set of cmp/select instructions.
+ static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem,
+ bool IsDivergentIdx);
+
private:
// Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
// three offsets (voffset, soffset and instoffset) into the SDValue[3] array
@@ -206,7 +217,7 @@ private:
/// \returns 0 If there is a non-constant offset or if the offset is 0.
/// Otherwise returns the constant offset.
unsigned setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG,
- SDValue *Offsets, unsigned Align = 4) const;
+ SDValue *Offsets, Align Alignment = Align(4)) const;
// Handle 8 bit and 16 bit buffer loads
SDValue handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT, SDLoc DL,
@@ -253,15 +264,18 @@ public:
MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
bool *IsFast = nullptr) const override;
- EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
- unsigned SrcAlign, bool IsMemset,
- bool ZeroMemset,
- bool MemcpyStrSrc,
+ EVT getOptimalMemOpType(const MemOp &Op,
const AttributeList &FuncAttributes) const override;
bool isMemOpUniform(const SDNode *N) const;
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const;
+ static bool isNonGlobalAddrSpace(unsigned AS) {
+ return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
+ AS == AMDGPUAS::PRIVATE_ADDRESS;
+ }
+
+ // FIXME: Missing constant_32bit
static bool isFlatGlobalAddrSpace(unsigned AS) {
return AS == AMDGPUAS::GLOBAL_ADDRESS ||
AS == AMDGPUAS::FLAT_ADDRESS ||
@@ -330,6 +344,9 @@ public:
SDValue LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const override;
+ SDValue lowerDYNAMIC_STACKALLOCImpl(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+
Register getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const override;
@@ -351,8 +368,7 @@ public:
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override;
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
EVT VT) const override;
- bool isFMADLegalForFAddFSub(const SelectionDAG &DAG,
- const SDNode *N) const override;
+ bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override;
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const;
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const;
@@ -377,17 +393,29 @@ public:
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint, MVT VT) const override;
ConstraintType getConstraintType(StringRef Constraint) const override;
+ void LowerAsmOperandForConstraint(SDValue Op,
+ std::string &Constraint,
+ std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const override;
+ bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const;
+ bool checkAsmConstraintVal(SDValue Op,
+ const std::string &Constraint,
+ uint64_t Val) const;
+ bool checkAsmConstraintValA(SDValue Op,
+ uint64_t Val,
+ unsigned MaxSize = 64) const;
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL,
SDValue V) const;
void finalizeLowering(MachineFunction &MF) const override;
- void computeKnownBitsForFrameIndex(const SDValue Op,
+ void computeKnownBitsForFrameIndex(int FrameIdx,
KnownBits &Known,
- const APInt &DemandedElts,
- const SelectionDAG &DAG,
- unsigned Depth = 0) const override;
+ const MachineFunction &MF) const override;
+ Align computeKnownAlignForTargetInstr(GISelKnownBits &Analysis, Register R,
+ const MachineRegisterInfo &MRI,
+ unsigned Depth = 0) const override;
bool isSDNodeSourceOfDivergence(const SDNode *N,
FunctionLoweringInfo *FLI, LegacyDivergenceAnalysis *DA) const override;
@@ -432,6 +460,13 @@ public:
MachineFunction &MF,
const SIRegisterInfo &TRI,
SIMachineFunctionInfo &Info) const;
+ void allocateSpecialInputVGPRsFixed(CCState &CCInfo,
+ MachineFunction &MF,
+ const SIRegisterInfo &TRI,
+ SIMachineFunctionInfo &Info) const;
+
+ std::pair<int, MVT> getTypeLegalizationCost(const DataLayout &DL,
+ Type *Ty) const;
};
} // End namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
new file mode 100644
index 0000000000000..35c49ae8c0dd1
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
@@ -0,0 +1,203 @@
+//===- SIInsertHardClauses.cpp - Insert Hard Clauses ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Insert s_clause instructions to form hard clauses.
+///
+/// Clausing load instructions can give cache coherency benefits. Before gfx10,
+/// the hardware automatically detected "soft clauses", which were sequences of
+/// memory instructions of the same type. In gfx10 this detection was removed,
+/// and the s_clause instruction was introduced to explicitly mark "hard
+/// clauses".
+///
+/// It's the scheduler's job to form the clauses by putting similar memory
+/// instructions next to each other. Our job is just to insert an s_clause
+/// instruction to mark the start of each clause.
+///
+/// Note that hard clauses are very similar to, but logically distinct from, the
+/// groups of instructions that have to be restartable when XNACK is enabled.
+/// The rules are slightly different in each case. For example an s_nop
+/// instruction breaks a restartable group, but can appear in the middle of a
+/// hard clause. (Before gfx10 there wasn't a distinction, and both were called
+/// "soft clauses" or just "clauses".)
+///
+/// The SIFormMemoryClauses pass and GCNHazardRecognizer deal with restartable
+/// groups, not hard clauses.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "llvm/ADT/SmallVector.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-insert-hard-clauses"
+
+namespace {
+
+enum HardClauseType {
+ // Texture, buffer, global or scratch memory instructions.
+ HARDCLAUSE_VMEM,
+ // Flat (not global or scratch) memory instructions.
+ HARDCLAUSE_FLAT,
+ // Instructions that access LDS.
+ HARDCLAUSE_LDS,
+ // Scalar memory instructions.
+ HARDCLAUSE_SMEM,
+ // VALU instructions.
+ HARDCLAUSE_VALU,
+ LAST_REAL_HARDCLAUSE_TYPE = HARDCLAUSE_VALU,
+
+ // Internal instructions, which are allowed in the middle of a hard clause,
+ // except for s_waitcnt.
+ HARDCLAUSE_INTERNAL,
+ // Instructions that are not allowed in a hard clause: SALU, export, branch,
+ // message, GDS, s_waitcnt and anything else not mentioned above.
+ HARDCLAUSE_ILLEGAL,
+};
+
+HardClauseType getHardClauseType(const MachineInstr &MI) {
+ // On current architectures we only get a benefit from clausing loads.
+ if (MI.mayLoad()) {
+ if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI))
+ return HARDCLAUSE_VMEM;
+ if (SIInstrInfo::isFLAT(MI))
+ return HARDCLAUSE_FLAT;
+ // TODO: LDS
+ if (SIInstrInfo::isSMRD(MI))
+ return HARDCLAUSE_SMEM;
+ }
+
+ // Don't form VALU clauses. It's not clear what benefit they give, if any.
+
+ // In practice s_nop is the only internal instruction we're likely to see.
+ // It's safe to treat the rest as illegal.
+ if (MI.getOpcode() == AMDGPU::S_NOP)
+ return HARDCLAUSE_INTERNAL;
+ return HARDCLAUSE_ILLEGAL;
+}
+
+class SIInsertHardClauses : public MachineFunctionPass {
+public:
+ static char ID;
+
+ SIInsertHardClauses() : MachineFunctionPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ // Track information about a clause as we discover it.
+ struct ClauseInfo {
+ // The type of all (non-internal) instructions in the clause.
+ HardClauseType Type = HARDCLAUSE_ILLEGAL;
+ // The first (necessarily non-internal) instruction in the clause.
+ MachineInstr *First = nullptr;
+ // The last non-internal instruction in the clause.
+ MachineInstr *Last = nullptr;
+ // The length of the clause including any internal instructions in the
+ // middle or after the end of the clause.
+ unsigned Length = 0;
+ // The base operands of *Last.
+ SmallVector<const MachineOperand *, 4> BaseOps;
+ };
+
+ bool emitClause(const ClauseInfo &CI, const SIInstrInfo *SII) {
+ // Get the size of the clause excluding any internal instructions at the
+ // end.
+ unsigned Size =
+ std::distance(CI.First->getIterator(), CI.Last->getIterator()) + 1;
+ if (Size < 2)
+ return false;
+ assert(Size <= 64 && "Hard clause is too long!");
+
+ auto &MBB = *CI.First->getParent();
+ auto ClauseMI =
+ BuildMI(MBB, *CI.First, DebugLoc(), SII->get(AMDGPU::S_CLAUSE))
+ .addImm(Size - 1);
+ finalizeBundle(MBB, ClauseMI->getIterator(),
+ std::next(CI.Last->getIterator()));
+ return true;
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ if (!ST.hasHardClauses())
+ return false;
+
+ const SIInstrInfo *SII = ST.getInstrInfo();
+ const TargetRegisterInfo *TRI = ST.getRegisterInfo();
+
+ bool Changed = false;
+ for (auto &MBB : MF) {
+ ClauseInfo CI;
+ for (auto &MI : MBB) {
+ HardClauseType Type = getHardClauseType(MI);
+
+ int64_t Dummy1;
+ bool Dummy2;
+ unsigned Dummy3;
+ SmallVector<const MachineOperand *, 4> BaseOps;
+ if (Type <= LAST_REAL_HARDCLAUSE_TYPE) {
+ if (!SII->getMemOperandsWithOffsetWidth(MI, BaseOps, Dummy1, Dummy2,
+ Dummy3, TRI)) {
+ // We failed to get the base operands, so we'll never clause this
+ // instruction with any other, so pretend it's illegal.
+ Type = HARDCLAUSE_ILLEGAL;
+ }
+ }
+
+ if (CI.Length == 64 ||
+ (CI.Length && Type != HARDCLAUSE_INTERNAL &&
+ (Type != CI.Type ||
+ // Note that we lie to shouldClusterMemOps about the size of the
+ // cluster. When shouldClusterMemOps is called from the machine
+ // scheduler it limits the size of the cluster to avoid increasing
+ // register pressure too much, but this pass runs after register
+ // allocation so there is no need for that kind of limit.
+ !SII->shouldClusterMemOps(CI.BaseOps, BaseOps, 2, 2)))) {
+ // Finish the current clause.
+ Changed |= emitClause(CI, SII);
+ CI = ClauseInfo();
+ }
+
+ if (CI.Length) {
+ // Extend the current clause.
+ ++CI.Length;
+ if (Type != HARDCLAUSE_INTERNAL) {
+ CI.Last = &MI;
+ CI.BaseOps = std::move(BaseOps);
+ }
+ } else if (Type <= LAST_REAL_HARDCLAUSE_TYPE) {
+ // Start a new clause.
+ CI = ClauseInfo{Type, &MI, &MI, 1, std::move(BaseOps)};
+ }
+ }
+
+ // Finish the last clause in the basic block if any.
+ if (CI.Length)
+ Changed |= emitClause(CI, SII);
+ }
+
+ return Changed;
+ }
+};
+
+} // namespace
+
+char SIInsertHardClauses::ID = 0;
+
+char &llvm::SIInsertHardClausesID = SIInsertHardClauses::ID;
+
+INITIALIZE_PASS(SIInsertHardClauses, DEBUG_TYPE, "SI Insert Hard Clauses",
+ false, false)
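
For illustration only (not part of the patch): the new pass above boils down to a greedy linear scan that keeps extending a clause while the memory-instruction type stays the same, and marks any run of two or more compatible instructions with an s_clause whose immediate is the run length minus one. Below is a minimal standalone C++ sketch of that grouping idea; the names (markClauses, Type) are invented and none of the internal-instruction or MIR bookkeeping is modeled.

#include <cstdio>
#include <vector>

enum Type { VMEM, SMEM, OTHER };

static void markClauses(const std::vector<Type> &Insts) {
  size_t Start = 0, Len = 0;
  Type Cur = OTHER;
  auto Flush = [&] {
    // Only runs of two or more instructions are worth an s_clause marker.
    if (Len >= 2)
      std::printf("s_clause %zu  ; covers instructions [%zu, %zu)\n",
                  Len - 1, Start, Start + Len);
    Len = 0;
  };
  for (size_t I = 0; I < Insts.size(); ++I) {
    Type T = Insts[I];
    // A type change, an unclausable instruction, or hitting the 64-entry
    // hardware limit ends the current clause.
    if (T == OTHER || T != Cur || Len == 64)
      Flush();
    if (T != OTHER) {
      if (Len == 0) { Start = I; Cur = T; }
      ++Len;
    }
  }
  Flush();
}

int main() {
  markClauses({VMEM, VMEM, SMEM, SMEM, SMEM, OTHER, VMEM});
}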
diff --git a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
index 80c044ec00cb3..052db5f6ea718 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -18,9 +18,11 @@
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
@@ -28,6 +30,7 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DebugLoc.h"
+#include "llvm/InitializePasses.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
@@ -52,21 +55,22 @@ private:
const SIRegisterInfo *TRI = nullptr;
const SIInstrInfo *TII = nullptr;
unsigned SkipThreshold = 0;
+ MachineDominatorTree *MDT = nullptr;
+
+ MachineBasicBlock *EarlyExitBlock = nullptr;
bool shouldSkip(const MachineBasicBlock &From,
const MachineBasicBlock &To) const;
- bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);
-
- void kill(MachineInstr &MI);
+ bool dominatesAllReachable(MachineBasicBlock &MBB);
+ void createEarlyExitBlock(MachineBasicBlock &MBB);
+ void skipIfDead(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ DebugLoc DL);
- MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) const;
+ bool kill(MachineInstr &MI);
bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);
- bool optimizeVccBranch(MachineInstr &MI) const;
-
public:
static char ID;
@@ -79,6 +83,8 @@ public:
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
MachineFunctionPass::getAnalysisUsage(AU);
}
};
@@ -87,8 +93,11 @@ public:
char SIInsertSkips::ID = 0;
-INITIALIZE_PASS(SIInsertSkips, DEBUG_TYPE,
- "SI insert s_cbranch_execz instructions", false, false)
+INITIALIZE_PASS_BEGIN(SIInsertSkips, DEBUG_TYPE,
+ "SI insert s_cbranch_execz instructions", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(SIInsertSkips, DEBUG_TYPE,
+ "SI insert s_cbranch_execz instructions", false, false)
char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID;
@@ -146,42 +155,110 @@ bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
return false;
}
-bool SIInsertSkips::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
- MachineBasicBlock &MBB = *MI.getParent();
- MachineFunction *MF = MBB.getParent();
-
- if (MF->getFunction().getCallingConv() != CallingConv::AMDGPU_PS ||
- !shouldSkip(MBB, MBB.getParent()->back()))
- return false;
+/// Check whether \p MBB dominates all blocks that are reachable from it.
+bool SIInsertSkips::dominatesAllReachable(MachineBasicBlock &MBB) {
+ for (MachineBasicBlock *Other : depth_first(&MBB)) {
+ if (!MDT->dominates(&MBB, Other))
+ return false;
+ }
+ return true;
+}
- MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());
+static void generatePsEndPgm(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, DebugLoc DL,
+ const SIInstrInfo *TII) {
+ // Generate "null export; s_endpgm".
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::EXP_DONE))
+ .addImm(0x09) // V_008DFC_SQ_EXP_NULL
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addImm(1) // vm
+ .addImm(0) // compr
+ .addImm(0); // en
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0);
+}
- const DebugLoc &DL = MI.getDebugLoc();
+void SIInsertSkips::createEarlyExitBlock(MachineBasicBlock &MBB) {
+ MachineFunction *MF = MBB.getParent();
+ DebugLoc DL;
- // If the exec mask is non-zero, skip the next two instructions
- BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
- .addMBB(&NextBB);
+ assert(!EarlyExitBlock);
+ EarlyExitBlock = MF->CreateMachineBasicBlock();
+ MF->insert(MF->end(), EarlyExitBlock);
- MachineBasicBlock::iterator Insert = SkipBB->begin();
+ generatePsEndPgm(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII);
+}
- // Exec mask is zero: Export to NULL target...
- BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP_DONE))
- .addImm(0x09) // V_008DFC_SQ_EXP_NULL
- .addReg(AMDGPU::VGPR0, RegState::Undef)
- .addReg(AMDGPU::VGPR0, RegState::Undef)
- .addReg(AMDGPU::VGPR0, RegState::Undef)
- .addReg(AMDGPU::VGPR0, RegState::Undef)
- .addImm(1) // vm
- .addImm(0) // compr
- .addImm(0); // en
+/// Insert an "if exec=0 { null export; s_endpgm }" sequence before the given
+/// iterator. Only applies to pixel shaders.
+void SIInsertSkips::skipIfDead(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, DebugLoc DL) {
+ MachineFunction *MF = MBB.getParent();
+ assert(MF->getFunction().getCallingConv() == CallingConv::AMDGPU_PS);
+
+ // It is possible for an SI_KILL_*_TERMINATOR to sit at the bottom of a
+ // basic block that has no further successors (e.g., there was an
+ // `unreachable` there in IR). This can happen with original source of the
+ // form:
+ //
+ // if (uniform_condition) {
+ // write_to_memory();
+ // discard;
+ // }
+ //
+ // In this case, we write the "null_export; s_endpgm" skip code in the
+ // already-existing basic block.
+ auto NextBBI = std::next(MBB.getIterator());
+ bool NoSuccessor = I == MBB.end() &&
+ llvm::find(MBB.successors(), &*NextBBI) == MBB.succ_end();
+
+ if (NoSuccessor) {
+ generatePsEndPgm(MBB, I, DL, TII);
+ } else {
+ if (!EarlyExitBlock) {
+ createEarlyExitBlock(MBB);
+ // Update next block pointer to reflect any new blocks
+ NextBBI = std::next(MBB.getIterator());
+ }
- // ... and terminate wavefront.
- BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0);
+ auto BranchMI = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
+ .addMBB(EarlyExitBlock);
+
+ // Split the block if the branch will not come at the end.
+ auto Next = std::next(BranchMI->getIterator());
+ if (Next != MBB.end() && !Next->isTerminator()) {
+ MachineBasicBlock *SplitBB =
+ MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+ MF->insert(NextBBI, SplitBB);
+ SplitBB->splice(SplitBB->begin(), &MBB, I, MBB.end());
+ SplitBB->transferSuccessorsAndUpdatePHIs(&MBB);
+ // FIXME: the expectation is that this will be used near the beginning
+ // of a block so just assume all registers are still live.
+ for (auto LiveIn : MBB.liveins())
+ SplitBB->addLiveIn(LiveIn);
+ MBB.addSuccessor(SplitBB);
+
+ // Update dominator tree
+ using DomTreeT = DomTreeBase<MachineBasicBlock>;
+ SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
+ for (MachineBasicBlock *Succ : SplitBB->successors()) {
+ DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
+ DTUpdates.push_back({DomTreeT::Delete, &MBB, Succ});
+ }
+ DTUpdates.push_back({DomTreeT::Insert, &MBB, SplitBB});
+ MDT->getBase().applyUpdates(DTUpdates);
+ }
- return true;
+ MBB.addSuccessor(EarlyExitBlock);
+ MDT->getBase().insertEdge(&MBB, EarlyExitBlock);
+ }
}
-void SIInsertSkips::kill(MachineInstr &MI) {
+/// Translate a SI_KILL_*_TERMINATOR into exec-manipulating instructions.
+/// Return true unless the terminator is a no-op.
+bool SIInsertSkips::kill(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
@@ -268,7 +345,7 @@ void SIInsertSkips::kill(MachineInstr &MI) {
I.addImm(0); // omod
}
- break;
+ return true;
}
case AMDGPU::SI_KILL_I1_TERMINATOR: {
const MachineFunction *MF = MI.getParent()->getParent();
@@ -283,11 +360,13 @@ void SIInsertSkips::kill(MachineInstr &MI) {
int64_t Imm = Op.getImm();
assert(Imm == 0 || Imm == -1);
- if (Imm == KillVal)
+ if (Imm == KillVal) {
BuildMI(MBB, &MI, DL, TII->get(ST.isWave32() ? AMDGPU::S_MOV_B32
: AMDGPU::S_MOV_B64), Exec)
.addImm(0);
- break;
+ return true;
+ }
+ return false;
}
unsigned Opcode = KillVal ? AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64;
@@ -296,27 +375,13 @@ void SIInsertSkips::kill(MachineInstr &MI) {
BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec)
.addReg(Exec)
.add(Op);
- break;
+ return true;
}
default:
llvm_unreachable("invalid opcode, expected SI_KILL_*_TERMINATOR");
}
}
-MachineBasicBlock *SIInsertSkips::insertSkipBlock(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
- MachineFunction *MF = MBB.getParent();
-
- MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock();
- MachineFunction::iterator MBBI(MBB);
- ++MBBI;
-
- MF->insert(MBBI, SkipBB);
- MBB.addSuccessor(SkipBB);
-
- return SkipBB;
-}
-
// Returns true if a branch over the block was inserted.
bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
MachineBasicBlock &SrcMBB) {
@@ -334,143 +399,24 @@ bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
return true;
}
-bool SIInsertSkips::optimizeVccBranch(MachineInstr &MI) const {
- // Match:
- // sreg = -1
- // vcc = S_AND_B64 exec, sreg
- // S_CBRANCH_VCC[N]Z
- // =>
- // S_CBRANCH_EXEC[N]Z
- bool Changed = false;
- MachineBasicBlock &MBB = *MI.getParent();
- const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
- const bool IsWave32 = ST.isWave32();
- const unsigned CondReg = TRI->getVCC();
- const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
-
- MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
- E = MBB.rend();
- bool ReadsCond = false;
- unsigned Threshold = 5;
- for (++A ; A != E ; ++A) {
- if (!--Threshold)
- return false;
- if (A->modifiesRegister(ExecReg, TRI))
- return false;
- if (A->modifiesRegister(CondReg, TRI)) {
- if (!A->definesRegister(CondReg, TRI) || A->getOpcode() != And)
- return false;
- break;
- }
- ReadsCond |= A->readsRegister(CondReg, TRI);
- }
- if (A == E)
- return false;
-
- MachineOperand &Op1 = A->getOperand(1);
- MachineOperand &Op2 = A->getOperand(2);
- if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
- TII->commuteInstruction(*A);
- Changed = true;
- }
- if (Op1.getReg() != ExecReg)
- return Changed;
- if (Op2.isImm() && Op2.getImm() != -1)
- return Changed;
-
- unsigned SReg = AMDGPU::NoRegister;
- if (Op2.isReg()) {
- SReg = Op2.getReg();
- auto M = std::next(A);
- bool ReadsSreg = false;
- for ( ; M != E ; ++M) {
- if (M->definesRegister(SReg, TRI))
- break;
- if (M->modifiesRegister(SReg, TRI))
- return Changed;
- ReadsSreg |= M->readsRegister(SReg, TRI);
- }
- if (M == E ||
- !M->isMoveImmediate() ||
- !M->getOperand(1).isImm() ||
- M->getOperand(1).getImm() != -1)
- return Changed;
- // First if sreg is only used in and instruction fold the immediate
- // into that and.
- if (!ReadsSreg && Op2.isKill()) {
- A->getOperand(2).ChangeToImmediate(-1);
- M->eraseFromParent();
- }
- }
-
- if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC) &&
- MI.killsRegister(CondReg, TRI))
- A->eraseFromParent();
-
- bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
- if (SReg == ExecReg) {
- if (IsVCCZ) {
- MI.eraseFromParent();
- return true;
- }
- MI.setDesc(TII->get(AMDGPU::S_BRANCH));
- } else {
- MI.setDesc(TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ
- : AMDGPU::S_CBRANCH_EXECNZ));
- }
-
- MI.RemoveOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI));
- MI.addImplicitDefUseOperands(*MBB.getParent());
-
- return true;
-}
-
bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
TRI = &TII->getRegisterInfo();
+ MDT = &getAnalysis<MachineDominatorTree>();
SkipThreshold = SkipThresholdFlag;
- bool HaveKill = false;
+ SmallVector<MachineInstr *, 4> KillInstrs;
bool MadeChange = false;
- // Track depth of exec mask, divergent branches.
- SmallVector<MachineBasicBlock *, 16> ExecBranchStack;
-
- MachineFunction::iterator NextBB;
-
- MachineBasicBlock *EmptyMBBAtEnd = nullptr;
-
- for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
- BI != BE; BI = NextBB) {
- NextBB = std::next(BI);
- MachineBasicBlock &MBB = *BI;
- bool HaveSkipBlock = false;
-
- if (!ExecBranchStack.empty() && ExecBranchStack.back() == &MBB) {
- // Reached convergence point for last divergent branch.
- ExecBranchStack.pop_back();
- }
-
- if (HaveKill && ExecBranchStack.empty()) {
- HaveKill = false;
-
- // TODO: Insert skip if exec is 0?
- }
-
+ for (MachineBasicBlock &MBB : MF) {
MachineBasicBlock::iterator I, Next;
for (I = MBB.begin(); I != MBB.end(); I = Next) {
Next = std::next(I);
-
MachineInstr &MI = *I;
switch (MI.getOpcode()) {
- case AMDGPU::S_CBRANCH_EXECZ:
- ExecBranchStack.push_back(MI.getOperand(0).getMBB());
- break;
case AMDGPU::SI_MASK_BRANCH:
- ExecBranchStack.push_back(MI.getOperand(0).getMBB());
MadeChange |= skipMaskBranch(MI, MBB);
break;
@@ -478,64 +424,60 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
// Optimize out branches to the next block.
// FIXME: Shouldn't this be handled by BranchFolding?
if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) {
+ assert(&MI == &MBB.back());
MI.eraseFromParent();
- } else if (HaveSkipBlock) {
- // Remove the given unconditional branch when a skip block has been
- // inserted after the current one and let skip the two instructions
- // performing the kill if the exec mask is non-zero.
- MI.eraseFromParent();
+ MadeChange = true;
}
break;
case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
- case AMDGPU::SI_KILL_I1_TERMINATOR:
+ case AMDGPU::SI_KILL_I1_TERMINATOR: {
MadeChange = true;
- kill(MI);
-
- if (ExecBranchStack.empty()) {
- if (NextBB != BE && skipIfDead(MI, *NextBB)) {
- HaveSkipBlock = true;
- NextBB = std::next(BI);
- BE = MF.end();
- }
+ bool CanKill = kill(MI);
+
+ // Check if we can add an early "if exec=0 { end shader }".
+ //
+ // Note that we _always_ do this if it is correct, even if the kill
+ // happens fairly late in the shader, because the null export should
+ // generally still be cheaper than normal export(s).
+ //
+ // TODO: The dominatesAllReachable check is conservative: if the
+ // dominance is only missing due to _uniform_ branches, we could
+ // in fact insert the early-exit as well.
+ if (CanKill &&
+ MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS &&
+ dominatesAllReachable(MBB)) {
+ // Mark the instruction for kill-if-dead insertion. We delay this
+ // change because it modifies the CFG.
+ KillInstrs.push_back(&MI);
} else {
- HaveKill = true;
+ MI.eraseFromParent();
}
-
- MI.eraseFromParent();
break;
+ }
- case AMDGPU::SI_RETURN_TO_EPILOG:
- // FIXME: Should move somewhere else
- assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());
-
- // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
- // because external bytecode will be appended at the end.
- if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
- // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty block at
- // the end and jump there.
- if (!EmptyMBBAtEnd) {
- EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
- MF.insert(MF.end(), EmptyMBBAtEnd);
- }
-
- MBB.addSuccessor(EmptyMBBAtEnd);
- BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
- .addMBB(EmptyMBBAtEnd);
- I->eraseFromParent();
+ case AMDGPU::SI_KILL_CLEANUP:
+ if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS &&
+ dominatesAllReachable(MBB)) {
+ KillInstrs.push_back(&MI);
+ } else {
+ MI.eraseFromParent();
}
break;
- case AMDGPU::S_CBRANCH_VCCZ:
- case AMDGPU::S_CBRANCH_VCCNZ:
- MadeChange |= optimizeVccBranch(MI);
- break;
-
default:
break;
}
}
}
+ for (MachineInstr *Kill : KillInstrs) {
+ skipIfDead(*Kill->getParent(), std::next(Kill->getIterator()),
+ Kill->getDebugLoc());
+ Kill->eraseFromParent();
+ }
+ KillInstrs.clear();
+ EarlyExitBlock = nullptr;
+
return MadeChange;
}
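
For illustration only (not part of the patch): the reworked SIInsertSkips above replaces the old per-kill skip block with either an in-place "null export; s_endpgm" sequence (when the kill block falls off the end of the shader) or a branch to a single, lazily created early-exit block shared by the whole function. A rough standalone sketch of that decision is below; the struct, function names and the printed text are purely illustrative.

#include <cstdio>

struct Block { bool HasSuccessor; };

static bool EarlyExitCreated = false;

static void handleKill(const Block &B) {
  if (!B.HasSuccessor) {
    // No successors: emit the end-of-shader sequence right here.
    std::puts("  <null export>");
    std::puts("  s_endpgm");
    return;
  }
  if (!EarlyExitCreated) {
    // Create the shared early-exit block once per function.
    std::puts("early_exit:");
    std::puts("  <null export>");
    std::puts("  s_endpgm");
    EarlyExitCreated = true;
  }
  // Otherwise just branch to it when exec is zero.
  std::puts("  s_cbranch_execz early_exit");
}

int main() {
  handleKill({/*HasSuccessor=*/true});
  handleKill({/*HasSuccessor=*/false});
}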
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index ef662d55cb0a9..2a157eb20ab47 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -32,6 +32,7 @@
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
@@ -57,7 +58,6 @@
#include <cstring>
#include <memory>
#include <utility>
-#include <vector>
using namespace llvm;
@@ -109,15 +109,13 @@ iterator_range<enum_iterator<InstCounterType>> inst_counter_types() {
enum_iterator<InstCounterType>(NUM_INST_CNTS));
}
-using RegInterval = std::pair<signed, signed>;
+using RegInterval = std::pair<int, int>;
struct {
- uint32_t VmcntMax;
- uint32_t ExpcntMax;
- uint32_t LgkmcntMax;
- uint32_t VscntMax;
- int32_t NumVGPRsMax;
- int32_t NumSGPRsMax;
+ unsigned VmcntMax;
+ unsigned ExpcntMax;
+ unsigned LgkmcntMax;
+ unsigned VscntMax;
} HardwareLimits;
struct {
@@ -143,7 +141,7 @@ enum WaitEventType {
NUM_WAIT_EVENTS,
};
-static const uint32_t WaitEventMaskForInst[NUM_INST_CNTS] = {
+static const unsigned WaitEventMaskForInst[NUM_INST_CNTS] = {
(1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS),
(1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
(1 << SQ_MESSAGE),
@@ -166,6 +164,28 @@ enum RegisterMapping {
NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
};
+// Enumerate different types of result-returning VMEM operations. Although
+// s_waitcnt orders them all with a single vmcnt counter, in the absence of
+// s_waitcnt only instructions of the same VmemType are guaranteed to write
+// their results in order -- so there is no need to insert an s_waitcnt between
+// two instructions of the same type that write the same vgpr.
+enum VmemType {
+ // BUF instructions and MIMG instructions without a sampler.
+ VMEM_NOSAMPLER,
+ // MIMG instructions with a sampler.
+ VMEM_SAMPLER,
+};
+
+VmemType getVmemType(const MachineInstr &Inst) {
+ assert(SIInstrInfo::isVMEM(Inst));
+ if (!SIInstrInfo::isMIMG(Inst))
+ return VMEM_NOSAMPLER;
+ const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
+ return AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler
+ ? VMEM_SAMPLER
+ : VMEM_NOSAMPLER;
+}
+
void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
switch (T) {
case VM_CNT:
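
For illustration only (not part of the patch): the VmemType distinction above feeds a per-vgpr bitmask (VgprVmemTypes, added further down in this file) so that a write-after-write hazard on a vgpr only forces a vmcnt wait when a *different* VMEM type may still be writing it. A tiny standalone sketch of that bitmask check, with names that only loosely mirror the patch:

#include <cassert>

enum VmemType { VMEM_NOSAMPLER, VMEM_SAMPLER };

int main() {
  unsigned char PendingTypes = 0;        // bitmask of pending types for one vgpr

  PendingTypes |= 1u << VMEM_NOSAMPLER;  // a buffer load writes the vgpr

  // Another buffer load to the same vgpr: same type, results arrive in
  // order, so no wait is needed.
  bool NeedWait = PendingTypes & ~(1u << VMEM_NOSAMPLER);
  assert(!NeedWait);

  // An image-sample write to the same vgpr: a different type may still be
  // pending, so a vmcnt wait is required first.
  NeedWait = PendingTypes & ~(1u << VMEM_SAMPLER);
  assert(NeedWait);
  (void)NeedWait;

  PendingTypes = 0;                      // cleared once the wait is emitted
}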
@@ -195,12 +215,9 @@ void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
// "s_waitcnt 0" before use.
class WaitcntBrackets {
public:
- WaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {
- for (auto T : inst_counter_types())
- memset(VgprScores[T], 0, sizeof(VgprScores[T]));
- }
+ WaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {}
- static uint32_t getWaitCountMax(InstCounterType T) {
+ static unsigned getWaitCountMax(InstCounterType T) {
switch (T) {
case VM_CNT:
return HardwareLimits.VmcntMax;
@@ -216,17 +233,13 @@ public:
return 0;
}
- uint32_t getScoreLB(InstCounterType T) const {
+ unsigned getScoreLB(InstCounterType T) const {
assert(T < NUM_INST_CNTS);
- if (T >= NUM_INST_CNTS)
- return 0;
return ScoreLBs[T];
}
- uint32_t getScoreUB(InstCounterType T) const {
+ unsigned getScoreUB(InstCounterType T) const {
assert(T < NUM_INST_CNTS);
- if (T >= NUM_INST_CNTS)
- return 0;
return ScoreUBs[T];
}
@@ -242,7 +255,7 @@ public:
return EXP_CNT;
}
- uint32_t getRegScore(int GprNo, InstCounterType T) {
+ unsigned getRegScore(int GprNo, InstCounterType T) {
if (GprNo < NUM_ALL_VGPRS) {
return VgprScores[T][GprNo];
}
@@ -250,30 +263,16 @@ public:
return SgprScores[GprNo - NUM_ALL_VGPRS];
}
- void clear() {
- memset(ScoreLBs, 0, sizeof(ScoreLBs));
- memset(ScoreUBs, 0, sizeof(ScoreUBs));
- PendingEvents = 0;
- memset(MixedPendingEvents, 0, sizeof(MixedPendingEvents));
- for (auto T : inst_counter_types())
- memset(VgprScores[T], 0, sizeof(VgprScores[T]));
- memset(SgprScores, 0, sizeof(SgprScores));
- }
-
bool merge(const WaitcntBrackets &Other);
RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
const MachineRegisterInfo *MRI,
- const SIRegisterInfo *TRI, unsigned OpNo,
- bool Def) const;
-
- int32_t getMaxVGPR() const { return VgprUB; }
- int32_t getMaxSGPR() const { return SgprUB; }
+ const SIRegisterInfo *TRI, unsigned OpNo) const;
bool counterOutOfOrder(InstCounterType T) const;
bool simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
bool simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
- void determineWait(InstCounterType T, uint32_t ScoreToWait,
+ void determineWait(InstCounterType T, unsigned ScoreToWait,
AMDGPU::Waitcnt &Wait) const;
void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
void applyWaitcnt(InstCounterType T, unsigned Count);
@@ -286,6 +285,12 @@ public:
return PendingEvents & (1 << E);
}
+ bool hasMixedPendingEvents(InstCounterType T) const {
+ unsigned Events = PendingEvents & WaitEventMaskForInst[T];
+ // Return true if more than one bit is set in Events.
+ return Events & (Events - 1);
+ }
+
bool hasPendingFlat() const {
return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
@@ -298,71 +303,77 @@ public:
LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
}
+ // Return true if there might be pending writes to the specified vgpr by VMEM
+ // instructions with types different from V.
+ bool hasOtherPendingVmemTypes(int GprNo, VmemType V) const {
+ assert(GprNo < NUM_ALL_VGPRS);
+ return VgprVmemTypes[GprNo] & ~(1 << V);
+ }
+
+ void clearVgprVmemTypes(int GprNo) {
+ assert(GprNo < NUM_ALL_VGPRS);
+ VgprVmemTypes[GprNo] = 0;
+ }
+
void print(raw_ostream &);
void dump() { print(dbgs()); }
private:
struct MergeInfo {
- uint32_t OldLB;
- uint32_t OtherLB;
- uint32_t MyShift;
- uint32_t OtherShift;
+ unsigned OldLB;
+ unsigned OtherLB;
+ unsigned MyShift;
+ unsigned OtherShift;
};
- static bool mergeScore(const MergeInfo &M, uint32_t &Score,
- uint32_t OtherScore);
+ static bool mergeScore(const MergeInfo &M, unsigned &Score,
+ unsigned OtherScore);
- void setScoreLB(InstCounterType T, uint32_t Val) {
+ void setScoreLB(InstCounterType T, unsigned Val) {
assert(T < NUM_INST_CNTS);
- if (T >= NUM_INST_CNTS)
- return;
ScoreLBs[T] = Val;
}
- void setScoreUB(InstCounterType T, uint32_t Val) {
+ void setScoreUB(InstCounterType T, unsigned Val) {
assert(T < NUM_INST_CNTS);
- if (T >= NUM_INST_CNTS)
- return;
ScoreUBs[T] = Val;
if (T == EXP_CNT) {
- uint32_t UB = ScoreUBs[T] - getWaitCountMax(EXP_CNT);
+ unsigned UB = ScoreUBs[T] - getWaitCountMax(EXP_CNT);
if (ScoreLBs[T] < UB && UB < ScoreUBs[T])
ScoreLBs[T] = UB;
}
}
- void setRegScore(int GprNo, InstCounterType T, uint32_t Val) {
+ void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
if (GprNo < NUM_ALL_VGPRS) {
- if (GprNo > VgprUB) {
- VgprUB = GprNo;
- }
+ VgprUB = std::max(VgprUB, GprNo);
VgprScores[T][GprNo] = Val;
} else {
assert(T == LGKM_CNT);
- if (GprNo - NUM_ALL_VGPRS > SgprUB) {
- SgprUB = GprNo - NUM_ALL_VGPRS;
- }
+ SgprUB = std::max(SgprUB, GprNo - NUM_ALL_VGPRS);
SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
}
}
void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
- unsigned OpNo, uint32_t Val);
+ unsigned OpNo, unsigned Val);
const GCNSubtarget *ST = nullptr;
- uint32_t ScoreLBs[NUM_INST_CNTS] = {0};
- uint32_t ScoreUBs[NUM_INST_CNTS] = {0};
- uint32_t PendingEvents = 0;
- bool MixedPendingEvents[NUM_INST_CNTS] = {false};
+ unsigned ScoreLBs[NUM_INST_CNTS] = {0};
+ unsigned ScoreUBs[NUM_INST_CNTS] = {0};
+ unsigned PendingEvents = 0;
// Remember the last flat memory operation.
- uint32_t LastFlat[NUM_INST_CNTS] = {0};
+ unsigned LastFlat[NUM_INST_CNTS] = {0};
// wait_cnt scores for every vgpr.
// Keep track of the VgprUB and SgprUB to make merge at join efficient.
- int32_t VgprUB = 0;
- int32_t SgprUB = 0;
- uint32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
+ int VgprUB = -1;
+ int SgprUB = -1;
+ unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
// Wait cnt scores for every sgpr, only lgkmcnt is relevant.
- uint32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
+ unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0};
+ // Bitmask of the VmemTypes of VMEM instructions that might have a pending
+ // write to each vgpr.
+ unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
};
class SIInsertWaitcnts : public MachineFunctionPass {
@@ -385,8 +396,7 @@ private:
explicit BlockInfo(MachineBasicBlock *MBB) : MBB(MBB) {}
};
- std::vector<BlockInfo> BlockInfos; // by reverse post-order traversal index
- DenseMap<MachineBasicBlock *, unsigned> RpotIdxMap;
+ MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
// ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0
// because of amdgpu-waitcnt-forcezero flag
@@ -464,10 +474,10 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
const SIInstrInfo *TII,
const MachineRegisterInfo *MRI,
const SIRegisterInfo *TRI,
- unsigned OpNo, bool Def) const {
+ unsigned OpNo) const {
const MachineOperand &Op = MI->getOperand(OpNo);
- if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
- (Def && !Op.isDef()) || TRI->isAGPR(*MRI, Op.getReg()))
+ assert(Op.isReg());
+ if (!TRI->isInAllocatableClass(Op.getReg()) || TRI->isAGPR(*MRI, Op.getReg()))
return {-1, -1};
// A use via a PW operand does not need a waitcnt.
@@ -475,29 +485,27 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
assert(!Op.getSubReg() || !Op.isUndef());
RegInterval Result;
- const MachineRegisterInfo &MRIA = *MRI;
unsigned Reg = TRI->getEncodingValue(Op.getReg());
- if (TRI->isVGPR(MRIA, Op.getReg())) {
+ if (TRI->isVGPR(*MRI, Op.getReg())) {
assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
Result.first = Reg - RegisterEncoding.VGPR0;
assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
- } else if (TRI->isSGPRReg(MRIA, Op.getReg())) {
+ } else if (TRI->isSGPRReg(*MRI, Op.getReg())) {
assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
assert(Result.first >= NUM_ALL_VGPRS &&
Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
}
// TODO: Handle TTMP
- // else if (TRI->isTTMP(MRIA, Reg.getReg())) ...
+ // else if (TRI->isTTMP(*MRI, Reg.getReg())) ...
else
return {-1, -1};
- const MachineInstr &MIA = *MI;
- const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo);
+ const TargetRegisterClass *RC = TII->getOpRegClass(*MI, OpNo);
unsigned Size = TRI->getRegSizeInBits(*RC);
- Result.second = Result.first + (Size / 32);
+ Result.second = Result.first + ((Size + 16) / 32);
return Result;
}
@@ -506,13 +514,10 @@ void WaitcntBrackets::setExpScore(const MachineInstr *MI,
const SIInstrInfo *TII,
const SIRegisterInfo *TRI,
const MachineRegisterInfo *MRI, unsigned OpNo,
- uint32_t Val) {
- RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
- LLVM_DEBUG({
- const MachineOperand &Opnd = MI->getOperand(OpNo);
- assert(TRI->isVGPR(*MRI, Opnd.getReg()));
- });
- for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+ unsigned Val) {
+ RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo);
+ assert(TRI->isVGPR(*MRI, MI->getOperand(OpNo).getReg()));
+ for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
setRegScore(RegNo, EXP_CNT, Val);
}
}
@@ -521,19 +526,14 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
const SIRegisterInfo *TRI,
const MachineRegisterInfo *MRI,
WaitEventType E, MachineInstr &Inst) {
- const MachineRegisterInfo &MRIA = *MRI;
InstCounterType T = eventCounter(E);
- uint32_t CurrScore = getScoreUB(T) + 1;
+ unsigned CurrScore = getScoreUB(T) + 1;
if (CurrScore == 0)
report_fatal_error("InsertWaitcnt score wraparound");
// PendingEvents and ScoreUB need to be updated regardless of whether this
// event changes the score of a register or not.
// Examples include vm_cnt when buffer-store or lgkm_cnt when send-message.

- if (!hasPendingEvent(E)) {
- if (PendingEvents & WaitEventMaskForInst[T])
- MixedPendingEvents[T] = true;
- PendingEvents |= 1 << E;
- }
+ PendingEvents |= 1 << E;
setScoreUB(T, CurrScore);
if (T == EXP_CNT) {
@@ -574,7 +574,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
const MachineOperand &Op = Inst.getOperand(I);
- if (Op.isReg() && !Op.isDef() && TRI->isVGPR(MRIA, Op.getReg())) {
+ if (Op.isReg() && !Op.isDef() && TRI->isVGPR(*MRI, Op.getReg())) {
setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
}
}
@@ -622,7 +622,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
MachineOperand &DefMO = Inst.getOperand(I);
if (DefMO.isReg() && DefMO.isDef() &&
- TRI->isVGPR(MRIA, DefMO.getReg())) {
+ TRI->isVGPR(*MRI, DefMO.getReg())) {
setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT,
CurrScore);
}
@@ -630,7 +630,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
}
for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
MachineOperand &MO = Inst.getOperand(I);
- if (MO.isReg() && !MO.isDef() && TRI->isVGPR(MRIA, MO.getReg())) {
+ if (MO.isReg() && !MO.isDef() && TRI->isVGPR(*MRI, MO.getReg())) {
setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
}
}
@@ -641,8 +641,8 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
unsigned OpNo;//TODO: find the OpNo for this operand;
- RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false);
- for (signed RegNo = Interval.first; RegNo < Interval.second;
+ RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo);
+ for (int RegNo = Interval.first; RegNo < Interval.second;
++RegNo) {
setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
}
@@ -650,10 +650,20 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
} else {
// Match the score to the destination registers.
for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
- RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I, true);
- if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS)
+ auto &Op = Inst.getOperand(I);
+ if (!Op.isReg() || !Op.isDef())
continue;
- for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+ RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I);
+ if (T == VM_CNT) {
+ if (Interval.first >= NUM_ALL_VGPRS)
+ continue;
+ if (SIInstrInfo::isVMEM(Inst)) {
+ VmemType V = getVmemType(Inst);
+ for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
+ VgprVmemTypes[RegNo] |= 1 << V;
+ }
+ }
+ for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
setRegScore(RegNo, T, CurrScore);
}
}
@@ -666,8 +676,8 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
void WaitcntBrackets::print(raw_ostream &OS) {
OS << '\n';
for (auto T : inst_counter_types()) {
- uint32_t LB = getScoreLB(T);
- uint32_t UB = getScoreUB(T);
+ unsigned LB = getScoreLB(T);
+ unsigned UB = getScoreUB(T);
switch (T) {
case VM_CNT:
@@ -689,11 +699,11 @@ void WaitcntBrackets::print(raw_ostream &OS) {
if (LB < UB) {
// Print vgpr scores.
- for (int J = 0; J <= getMaxVGPR(); J++) {
- uint32_t RegScore = getRegScore(J, T);
+ for (int J = 0; J <= VgprUB; J++) {
+ unsigned RegScore = getRegScore(J, T);
if (RegScore <= LB)
continue;
- uint32_t RelScore = RegScore - LB - 1;
+ unsigned RelScore = RegScore - LB - 1;
if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
OS << RelScore << ":v" << J << " ";
} else {
@@ -702,11 +712,11 @@ void WaitcntBrackets::print(raw_ostream &OS) {
}
// Also need to print sgpr scores for lgkm_cnt.
if (T == LGKM_CNT) {
- for (int J = 0; J <= getMaxSGPR(); J++) {
- uint32_t RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
+ for (int J = 0; J <= SgprUB; J++) {
+ unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
if (RegScore <= LB)
continue;
- uint32_t RelScore = RegScore - LB - 1;
+ unsigned RelScore = RegScore - LB - 1;
OS << RelScore << ":s" << J << " ";
}
}
@@ -727,8 +737,8 @@ bool WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
bool WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
unsigned &Count) const {
- const uint32_t LB = getScoreLB(T);
- const uint32_t UB = getScoreUB(T);
+ const unsigned LB = getScoreLB(T);
+ const unsigned UB = getScoreUB(T);
if (Count < UB && UB - Count > LB)
return true;
@@ -736,12 +746,12 @@ bool WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
return false;
}
-void WaitcntBrackets::determineWait(InstCounterType T, uint32_t ScoreToWait,
+void WaitcntBrackets::determineWait(InstCounterType T, unsigned ScoreToWait,
AMDGPU::Waitcnt &Wait) const {
// If the score of src_operand falls within the bracket, we need an
// s_waitcnt instruction.
- const uint32_t LB = getScoreLB(T);
- const uint32_t UB = getScoreUB(T);
+ const unsigned LB = getScoreLB(T);
+ const unsigned UB = getScoreUB(T);
if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
if ((T == VM_CNT || T == LGKM_CNT) &&
hasPendingFlat() &&
@@ -758,7 +768,7 @@ void WaitcntBrackets::determineWait(InstCounterType T, uint32_t ScoreToWait,
} else {
// If a counter has been maxed out avoid overflow by waiting for
// MAX(CounterType) - 1 instead.
- uint32_t NeededWait = std::min(UB - ScoreToWait, getWaitCountMax(T) - 1);
+ unsigned NeededWait = std::min(UB - ScoreToWait, getWaitCountMax(T) - 1);
addWait(Wait, T, NeededWait);
}
}
@@ -772,7 +782,7 @@ void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
}
void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
- const uint32_t UB = getScoreUB(T);
+ const unsigned UB = getScoreUB(T);
if (Count >= UB)
return;
if (Count != 0) {
@@ -781,7 +791,6 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
setScoreLB(T, std::max(getScoreLB(T), UB - Count));
} else {
setScoreLB(T, UB);
- MixedPendingEvents[T] = false;
PendingEvents &= ~WaitEventMaskForInst[T];
}
}
@@ -792,7 +801,7 @@ bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
// Scalar memory read always can go out of order.
if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS))
return true;
- return MixedPendingEvents[T];
+ return hasMixedPendingEvents(T);
}
INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
@@ -954,10 +963,10 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
int CallAddrOpIdx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
- RegInterval CallAddrOpInterval = ScoreBrackets.getRegInterval(
- &MI, TII, MRI, TRI, CallAddrOpIdx, false);
+ RegInterval CallAddrOpInterval =
+ ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, CallAddrOpIdx);
- for (signed RegNo = CallAddrOpInterval.first;
+ for (int RegNo = CallAddrOpInterval.first;
RegNo < CallAddrOpInterval.second; ++RegNo)
ScoreBrackets.determineWait(
LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
@@ -965,10 +974,10 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
int RtnAddrOpIdx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
if (RtnAddrOpIdx != -1) {
- RegInterval RtnAddrOpInterval = ScoreBrackets.getRegInterval(
- &MI, TII, MRI, TRI, RtnAddrOpIdx, false);
+ RegInterval RtnAddrOpInterval =
+ ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, RtnAddrOpIdx);
- for (signed RegNo = RtnAddrOpInterval.first;
+ for (int RegNo = RtnAddrOpInterval.first;
RegNo < RtnAddrOpInterval.second; ++RegNo)
ScoreBrackets.determineWait(
LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
@@ -982,7 +991,19 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
// emitted.
// If the source operand was defined by a load, add the s_waitcnt
// instruction.
+ //
+ // Two cases are handled for destination operands:
+ // 1) If the destination operand was defined by a load, add the s_waitcnt
+ // instruction to guarantee the right WAW order.
+ // 2) If a destination operand was used by a recent export/store
+ // instruction, add s_waitcnt on exp_cnt to guarantee the WAR order.
for (const MachineMemOperand *Memop : MI.memoperands()) {
+ const Value *Ptr = Memop->getValue();
+ if (Memop->isStore() && SLoadAddresses.count(Ptr)) {
+ addWait(Wait, LGKM_CNT, 0);
+ if (PDT->dominates(MI.getParent(), SLoadAddresses.find(Ptr)->second))
+ SLoadAddresses.erase(Ptr);
+ }
unsigned AS = Memop->getAddrSpace();
if (AS != AMDGPUAS::LOCAL_ADDRESS)
continue;
@@ -990,67 +1011,41 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
// VM_CNT is only relevant to vgpr or LDS.
ScoreBrackets.determineWait(
VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
- }
-
- for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
- const MachineOperand &Op = MI.getOperand(I);
- const MachineRegisterInfo &MRIA = *MRI;
- RegInterval Interval =
- ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, false);
- for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
- if (TRI->isVGPR(MRIA, Op.getReg())) {
- // VM_CNT is only relevant to vgpr or LDS.
- ScoreBrackets.determineWait(
- VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
- }
- ScoreBrackets.determineWait(
- LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
- }
- }
- // End of for loop that looks at all source operands to decide vm_wait_cnt
- // and lgk_wait_cnt.
-
- // Two cases are handled for destination operands:
- // 1) If the destination operand was defined by a load, add the s_waitcnt
- // instruction to guarantee the right WAW order.
- // 2) If a destination operand that was used by a recent export/store ins,
- // add s_waitcnt on exp_cnt to guarantee the WAR order.
- if (MI.mayStore()) {
- // FIXME: Should not be relying on memoperands.
- for (const MachineMemOperand *Memop : MI.memoperands()) {
- const Value *Ptr = Memop->getValue();
- if (SLoadAddresses.count(Ptr)) {
- addWait(Wait, LGKM_CNT, 0);
- if (PDT->dominates(MI.getParent(),
- SLoadAddresses.find(Ptr)->second))
- SLoadAddresses.erase(Ptr);
- }
- unsigned AS = Memop->getAddrSpace();
- if (AS != AMDGPUAS::LOCAL_ADDRESS)
- continue;
- unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
- ScoreBrackets.determineWait(
- VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
+ if (Memop->isStore()) {
ScoreBrackets.determineWait(
EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
}
}
+
+ // Loop over use and def operands.
for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
- MachineOperand &Def = MI.getOperand(I);
- const MachineRegisterInfo &MRIA = *MRI;
+ MachineOperand &Op = MI.getOperand(I);
+ if (!Op.isReg())
+ continue;
RegInterval Interval =
- ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, true);
- for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
- if (TRI->isVGPR(MRIA, Def.getReg())) {
- ScoreBrackets.determineWait(
- VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
- ScoreBrackets.determineWait(
- EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
+ ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I);
+ for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+ if (TRI->isVGPR(*MRI, Op.getReg())) {
+ // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
+ // previous write and this write are the same type of VMEM
+ // instruction, in which case they're guaranteed to write their
+ // results in order anyway.
+ if (Op.isUse() || !SIInstrInfo::isVMEM(MI) ||
+ ScoreBrackets.hasOtherPendingVmemTypes(RegNo,
+ getVmemType(MI))) {
+ ScoreBrackets.determineWait(
+ VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
+ ScoreBrackets.clearVgprVmemTypes(RegNo);
+ }
+ if (Op.isDef()) {
+ ScoreBrackets.determineWait(
+ EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
+ }
}
ScoreBrackets.determineWait(
LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
}
- } // End of for loop that looks at all dest operands.
+ }
}
}
@@ -1154,7 +1149,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
}
LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
- << "Old Instr: " << MI << '\n'
+ << "Old Instr: " << MI
<< "New Instr: " << *II << '\n');
if (!Wait.hasWait())
@@ -1171,7 +1166,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
Modified = true;
LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
- << "Old Instr: " << MI << '\n'
+ << "Old Instr: " << MI
<< "New Instr: " << *SWaitInst << '\n');
}
@@ -1187,7 +1182,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
Modified = true;
LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
- << "Old Instr: " << MI << '\n'
+ << "Old Instr: " << MI
<< "New Instr: " << *SWaitInst << '\n');
}
@@ -1303,10 +1298,10 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
}
}
-bool WaitcntBrackets::mergeScore(const MergeInfo &M, uint32_t &Score,
- uint32_t OtherScore) {
- uint32_t MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
- uint32_t OtherShifted =
+bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
+ unsigned OtherScore) {
+ unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
+ unsigned OtherShifted =
OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
Score = std::max(MyShifted, OtherShifted);
return OtherShifted > MyShifted;
@@ -1320,44 +1315,50 @@ bool WaitcntBrackets::mergeScore(const MergeInfo &M, uint32_t &Score,
bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
bool StrictDom = false;
+ VgprUB = std::max(VgprUB, Other.VgprUB);
+ SgprUB = std::max(SgprUB, Other.SgprUB);
+
for (auto T : inst_counter_types()) {
// Merge event flags for this counter
const bool OldOutOfOrder = counterOutOfOrder(T);
- const uint32_t OldEvents = PendingEvents & WaitEventMaskForInst[T];
- const uint32_t OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
+ const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
+ const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
if (OtherEvents & ~OldEvents)
StrictDom = true;
- if (Other.MixedPendingEvents[T] ||
- (OldEvents && OtherEvents && OldEvents != OtherEvents))
- MixedPendingEvents[T] = true;
PendingEvents |= OtherEvents;
// Merge scores for this counter
- const uint32_t MyPending = ScoreUBs[T] - ScoreLBs[T];
- const uint32_t OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
+ const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
+ const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
+ const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
+ if (NewUB < ScoreLBs[T])
+ report_fatal_error("waitcnt score overflow");
+
MergeInfo M;
M.OldLB = ScoreLBs[T];
M.OtherLB = Other.ScoreLBs[T];
- M.MyShift = OtherPending > MyPending ? OtherPending - MyPending : 0;
- M.OtherShift = ScoreUBs[T] - Other.ScoreUBs[T] + M.MyShift;
+ M.MyShift = NewUB - ScoreUBs[T];
+ M.OtherShift = NewUB - Other.ScoreUBs[T];
- const uint32_t NewUB = ScoreUBs[T] + M.MyShift;
- if (NewUB < ScoreUBs[T])
- report_fatal_error("waitcnt score overflow");
ScoreUBs[T] = NewUB;
- ScoreLBs[T] = std::min(M.OldLB + M.MyShift, M.OtherLB + M.OtherShift);
StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
bool RegStrictDom = false;
- for (int J = 0, E = std::max(getMaxVGPR(), Other.getMaxVGPR()) + 1; J != E;
- J++) {
+ for (int J = 0; J <= VgprUB; J++) {
RegStrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
}
+ if (T == VM_CNT) {
+ for (int J = 0; J <= VgprUB; J++) {
+ unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J];
+ RegStrictDom |= NewVmemTypes != VgprVmemTypes[J];
+ VgprVmemTypes[J] = NewVmemTypes;
+ }
+ }
+
if (T == LGKM_CNT) {
- for (int J = 0, E = std::max(getMaxSGPR(), Other.getMaxSGPR()) + 1;
- J != E; J++) {
+ for (int J = 0; J <= SgprUB; J++) {
RegStrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
}
}
@@ -1366,9 +1367,6 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
StrictDom = true;
}
- VgprUB = std::max(getMaxVGPR(), Other.getMaxVGPR());
- SgprUB = std::max(getMaxSGPR(), Other.getMaxSGPR());
-
return StrictDom;
}
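
For illustration only (not part of the patch): the rewritten merge above picks a common upper bound NewUB = ScoreLB + max(pending on either path) and rebases every score by the corresponding shift, with anything at or below its old lower bound collapsing to zero ("nothing pending"). A small numeric sketch of that arithmetic in plain C++, with invented variable names:

#include <algorithm>
#include <cassert>

int main() {
  unsigned MyLB = 10, MyUB = 13;       // 3 pending events on this path
  unsigned OtherLB = 4, OtherUB = 9;   // 5 pending events on the other path
  unsigned MyPending = MyUB - MyLB;
  unsigned OtherPending = OtherUB - OtherLB;

  unsigned NewUB = MyLB + std::max(MyPending, OtherPending); // 15
  unsigned MyShift = NewUB - MyUB;                           // 2
  unsigned OtherShift = NewUB - OtherUB;                     // 6

  // Rebase one register score from each side, as mergeScore() does.
  auto Rebase = [](unsigned Score, unsigned OldLB, unsigned Shift) {
    return Score <= OldLB ? 0u : Score + Shift;
  };
  unsigned Mine = Rebase(12, MyLB, MyShift);        // 14: still pending
  unsigned Theirs = Rebase(4, OtherLB, OtherShift); //  0: already waited on
  unsigned Merged = std::max(Mine, Theirs);
  assert(Merged == 14);
  (void)Merged;
}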
@@ -1383,6 +1381,10 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
ScoreBrackets.dump();
});
+ // Assume VCCZ is correct at basic block boundaries, unless and until we need
+ // to handle cases where that is not true.
+ bool VCCZCorrect = true;
+
// Walk over the instructions.
MachineInstr *OldWaitcntInstr = nullptr;
@@ -1402,13 +1404,26 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
continue;
}
- bool VCCZBugWorkAround = false;
+ // We might need to restore vccz to its correct value for either of two
+ // different reasons; see ST->hasReadVCCZBug() and
+ // ST->partialVCCWritesUpdateVCCZ().
+ bool RestoreVCCZ = false;
if (readsVCCZ(Inst)) {
- if (ScoreBrackets.getScoreLB(LGKM_CNT) <
- ScoreBrackets.getScoreUB(LGKM_CNT) &&
- ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
- if (ST->hasReadVCCZBug())
- VCCZBugWorkAround = true;
+ if (!VCCZCorrect)
+ RestoreVCCZ = true;
+ else if (ST->hasReadVCCZBug()) {
+ // There is a hardware bug on CI/SI where an SMRD instruction may corrupt
+ // the vccz bit, so when we detect that an instruction may read from a
+ // corrupt vccz bit, we need to:
+ // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
+ // operations to complete.
+ // 2. Restore the correct value of vccz by writing the current value
+ // of vcc back to vcc.
+ if (ScoreBrackets.getScoreLB(LGKM_CNT) <
+ ScoreBrackets.getScoreUB(LGKM_CNT) &&
+ ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
+ RestoreVCCZ = true;
+ }
}
}
@@ -1419,6 +1434,16 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
}
}
+ if (!ST->partialVCCWritesUpdateVCCZ()) {
+ // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
+ // Writes to vcc will fix it.
+ if (Inst.definesRegister(AMDGPU::VCC_LO) ||
+ Inst.definesRegister(AMDGPU::VCC_HI))
+ VCCZCorrect = false;
+ else if (Inst.definesRegister(AMDGPU::VCC))
+ VCCZCorrect = true;
+ }
+
// Generate an s_waitcnt instruction to be placed before
// cur_Inst, if needed.
Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
@@ -1444,7 +1469,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
// TODO: Remove this work-around after fixing the scheduler and enable the
// assert above.
- if (VCCZBugWorkAround) {
+ if (RestoreVCCZ) {
// Restore the vccz bit. Any time a value is written to vcc, the vcc
// bit is updated, so we can restore the bit by reading the value of
// vcc and then writing it back to the register.
@@ -1452,6 +1477,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
TRI->getVCC())
.addReg(TRI->getVCC());
+ VCCZCorrect = true;
Modified = true;
}
@@ -1479,29 +1505,23 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
HardwareLimits.VscntMax = ST->hasVscnt() ? 63 : 0;
- HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
- HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
- assert(HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
- assert(HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
+ unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
+ unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
+ assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
+ assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
- RegisterEncoding.VGPRL =
- RegisterEncoding.VGPR0 + HardwareLimits.NumVGPRsMax - 1;
+ RegisterEncoding.VGPRL = RegisterEncoding.VGPR0 + NumVGPRsMax - 1;
RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
- RegisterEncoding.SGPRL =
- RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;
+ RegisterEncoding.SGPRL = RegisterEncoding.SGPR0 + NumSGPRsMax - 1;
TrackedWaitcntSet.clear();
- RpotIdxMap.clear();
BlockInfos.clear();
// Keep iterating over the blocks in reverse post order, inserting and
// updating s_waitcnt where needed, until a fix point is reached.
- for (MachineBasicBlock *MBB :
- ReversePostOrderTraversal<MachineFunction *>(&MF)) {
- RpotIdxMap[MBB] = BlockInfos.size();
- BlockInfos.emplace_back(MBB);
- }
+ for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
+ BlockInfos.insert({MBB, BlockInfo(MBB)});
std::unique_ptr<WaitcntBrackets> Brackets;
bool Modified = false;
@@ -1509,12 +1529,12 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
do {
Repeat = false;
- for (BlockInfo &BI : BlockInfos) {
+ for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
+ ++BII) {
+ BlockInfo &BI = BII->second;
if (!BI.Dirty)
continue;
- unsigned Idx = std::distance(&*BlockInfos.begin(), &BI);
-
if (BI.Incoming) {
if (!Brackets)
Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
@@ -1524,7 +1544,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
if (!Brackets)
Brackets = std::make_unique<WaitcntBrackets>(ST);
else
- Brackets->clear();
+ *Brackets = WaitcntBrackets(ST);
}
Modified |= insertWaitcntInBlock(MF, *BI.MBB, *Brackets);
@@ -1533,11 +1553,11 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
if (Brackets->hasPending()) {
BlockInfo *MoveBracketsToSucc = nullptr;
for (MachineBasicBlock *Succ : BI.MBB->successors()) {
- unsigned SuccIdx = RpotIdxMap[Succ];
- BlockInfo &SuccBI = BlockInfos[SuccIdx];
+ auto SuccBII = BlockInfos.find(Succ);
+ BlockInfo &SuccBI = SuccBII->second;
if (!SuccBI.Incoming) {
SuccBI.Dirty = true;
- if (SuccIdx <= Idx)
+ if (SuccBII <= BII)
Repeat = true;
if (!MoveBracketsToSucc) {
MoveBracketsToSucc = &SuccBI;
@@ -1546,7 +1566,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
}
} else if (SuccBI.Incoming->merge(*Brackets)) {
SuccBI.Dirty = true;
- if (SuccIdx <= Idx)
+ if (SuccBII <= BII)
Repeat = true;
}
}
@@ -1612,13 +1632,15 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
// TODO: Could insert earlier and schedule more liberally with operations
// that only use caller preserved registers.
MachineBasicBlock &EntryBB = MF.front();
+ MachineBasicBlock::iterator I = EntryBB.begin();
+ for (MachineBasicBlock::iterator E = EntryBB.end();
+ I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
+ ;
+ BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
if (ST->hasVscnt())
- BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(),
- TII->get(AMDGPU::S_WAITCNT_VSCNT))
- .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
- .addImm(0);
- BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
- .addImm(0);
+ BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT))
+ .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+ .addImm(0);
Modified = true;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
index 4dcbe92861f23..428c21c896d50 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
@@ -114,6 +114,9 @@ class InstSI <dag outs, dag ins, string asm = "",
// FLAT_SCRATCH segment. Must be 0 for non-FLAT instructions.
field bit IsNonFlatSeg = 0;
+ // Reads the mode register, usually for FP environment.
+ field bit ReadsModeReg = 0;
+
// This bit indicates that this uses the floating point double precision
// rounding mode flags
field bit FPDPRounding = 0;
@@ -303,7 +306,7 @@ class MIMGe_gfx10 <bits<8> op> : MIMGe {
bits<3> dim;
bits<2> nsa;
bits<1> dlc;
- bits<1> a16 = 0; // TODO: this should be an operand
+ bits<1> a16;
let Inst{0} = op{7};
let Inst{2-1} = nsa;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d53950ca44655..9af8ffedce0f3 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -63,6 +63,8 @@
using namespace llvm;
+#define DEBUG_TYPE "si-instr-info"
+
#define GET_INSTRINFO_CTOR_DTOR
#include "AMDGPUGenInstrInfo.inc"
@@ -83,6 +85,12 @@ static cl::opt<unsigned>
BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
cl::desc("Restrict range of branch instructions (DEBUG)"));
+static cl::opt<bool> Fix16BitCopies(
+ "amdgpu-fix-16-bit-physreg-copies",
+ cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
+ cl::init(true),
+ cl::ReallyHidden);
+
SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
: AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
RI(ST), ST(ST) {
@@ -136,6 +144,8 @@ bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
case AMDGPU::V_MOV_B32_e32:
case AMDGPU::V_MOV_B32_e64:
case AMDGPU::V_MOV_B64_PSEUDO:
+ case AMDGPU::V_ACCVGPR_READ_B32:
+ case AMDGPU::V_ACCVGPR_WRITE_B32:
// No implicit operands.
return MI.getNumOperands() == MI.getDesc().getNumOperands();
default:
@@ -258,43 +268,49 @@ static bool isStride64(unsigned Opc) {
}
}
-bool SIInstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt,
- const MachineOperand *&BaseOp,
- int64_t &Offset,
- const TargetRegisterInfo *TRI) const {
+bool SIInstrInfo::getMemOperandsWithOffsetWidth(
+ const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
+ int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
+ const TargetRegisterInfo *TRI) const {
if (!LdSt.mayLoadOrStore())
return false;
unsigned Opc = LdSt.getOpcode();
+ OffsetIsScalable = false;
+ const MachineOperand *BaseOp, *OffsetOp;
+ int DataOpIdx;
if (isDS(LdSt)) {
- const MachineOperand *OffsetImm =
- getNamedOperand(LdSt, AMDGPU::OpName::offset);
- if (OffsetImm) {
+ BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
+ OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
+ if (OffsetOp) {
// Normal, single offset LDS instruction.
- BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
- // TODO: ds_consume/ds_append use M0 for the base address. Is it safe to
- // report that here?
- if (!BaseOp || !BaseOp->isReg())
+ if (!BaseOp) {
+ // DS_CONSUME/DS_APPEND use M0 for the base address.
+ // TODO: find the implicit use operand for M0 and use that as BaseOp?
+ return false;
+ }
+ BaseOps.push_back(BaseOp);
+ Offset = OffsetOp->getImm();
+ // Get appropriate operand, and compute width accordingly.
+ DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
+ if (DataOpIdx == -1)
+ DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
+ Width = getOpSize(LdSt, DataOpIdx);
+ } else {
+ // The 2 offset instructions use offset0 and offset1 instead. We can treat
+ // these as a load with a single offset if the 2 offsets are consecutive.
+ // We will use this for some partially aligned loads.
+ const MachineOperand *Offset0Op =
+ getNamedOperand(LdSt, AMDGPU::OpName::offset0);
+ const MachineOperand *Offset1Op =
+ getNamedOperand(LdSt, AMDGPU::OpName::offset1);
+
+ unsigned Offset0 = Offset0Op->getImm();
+ unsigned Offset1 = Offset1Op->getImm();
+ if (Offset0 + 1 != Offset1)
return false;
- Offset = OffsetImm->getImm();
-
- return true;
- }
-
- // The 2 offset instructions use offset0 and offset1 instead. We can treat
- // these as a load with a single offset if the 2 offsets are consecutive. We
- // will use this for some partially aligned loads.
- const MachineOperand *Offset0Imm =
- getNamedOperand(LdSt, AMDGPU::OpName::offset0);
- const MachineOperand *Offset1Imm =
- getNamedOperand(LdSt, AMDGPU::OpName::offset1);
-
- uint8_t Offset0 = Offset0Imm->getImm();
- uint8_t Offset1 = Offset1Imm->getImm();
-
- if (Offset1 > Offset0 && Offset1 - Offset0 == 1) {
// Each of these offsets is in element sized units, so we need to convert
// to bytes of the individual reads.
@@ -310,16 +326,20 @@ bool SIInstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt,
if (isStride64(Opc))
EltSize *= 64;
- BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
- if (!BaseOp->isReg())
- return false;
-
+ BaseOps.push_back(BaseOp);
Offset = EltSize * Offset0;
-
- return true;
+ // Get appropriate operand(s), and compute width accordingly.
+ DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
+ if (DataOpIdx == -1) {
+ DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
+ Width = getOpSize(LdSt, DataOpIdx);
+ DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
+ Width += getOpSize(LdSt, DataOpIdx);
+ } else {
+ Width = getOpSize(LdSt, DataOpIdx);
+ }
}
-
- return false;
+ return true;
}
if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
@@ -339,59 +359,78 @@ bool SIInstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt,
const MachineOperand *OffsetImm =
getNamedOperand(LdSt, AMDGPU::OpName::offset);
- BaseOp = SOffset;
+ BaseOps.push_back(RSrc);
+ BaseOps.push_back(SOffset);
Offset = OffsetImm->getImm();
- return true;
- }
-
- const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
- if (!AddrReg)
- return false;
+ } else {
+ BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
+ if (!BaseOp) // e.g. BUFFER_WBINVL1_VOL
+ return false;
+ BaseOps.push_back(BaseOp);
- const MachineOperand *OffsetImm =
- getNamedOperand(LdSt, AMDGPU::OpName::offset);
- BaseOp = AddrReg;
- Offset = OffsetImm->getImm();
- if (SOffset) // soffset can be an inline immediate.
- Offset += SOffset->getImm();
+ BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
+ if (BaseOp)
+ BaseOps.push_back(BaseOp);
- if (!BaseOp->isReg())
- return false;
+ const MachineOperand *OffsetImm =
+ getNamedOperand(LdSt, AMDGPU::OpName::offset);
+ Offset = OffsetImm->getImm();
+ if (SOffset) // soffset can be an inline immediate.
+ Offset += SOffset->getImm();
+ }
+ // Get appropriate operand, and compute width accordingly.
+ DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
+ if (DataOpIdx == -1)
+ DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
+ Width = getOpSize(LdSt, DataOpIdx);
+ return true;
+ }
+ if (isMIMG(LdSt)) {
+ int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
+ BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
+ int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
+ if (VAddr0Idx >= 0) {
+ // GFX10 possible NSA encoding.
+ for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
+ BaseOps.push_back(&LdSt.getOperand(I));
+ } else {
+ BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
+ }
+ Offset = 0;
+ // Get appropriate operand, and compute width accordingly.
+ DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
+ Width = getOpSize(LdSt, DataOpIdx);
return true;
}
if (isSMRD(LdSt)) {
- const MachineOperand *OffsetImm =
- getNamedOperand(LdSt, AMDGPU::OpName::offset);
- if (!OffsetImm)
+ BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
+ if (!BaseOp) // e.g. S_MEMTIME
return false;
-
- const MachineOperand *SBaseReg = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
- BaseOp = SBaseReg;
- Offset = OffsetImm->getImm();
- if (!BaseOp->isReg())
- return false;
-
+ BaseOps.push_back(BaseOp);
+ OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
+ Offset = OffsetOp ? OffsetOp->getImm() : 0;
+ // Get appropriate operand, and compute width accordingly.
+ DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
+ Width = getOpSize(LdSt, DataOpIdx);
return true;
}
if (isFLAT(LdSt)) {
- const MachineOperand *VAddr = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
- if (VAddr) {
- // Can't analyze 2 offsets.
- if (getNamedOperand(LdSt, AMDGPU::OpName::saddr))
- return false;
-
- BaseOp = VAddr;
- } else {
- // scratch instructions have either vaddr or saddr.
- BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
- }
-
+ // Instructions have either vaddr or saddr or both.
+ BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
+ if (BaseOp)
+ BaseOps.push_back(BaseOp);
+ BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
+ if (BaseOp)
+ BaseOps.push_back(BaseOp);
Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
- if (!BaseOp->isReg())
- return false;
+ // Get appropriate operand, and compute width accordingly.
+ DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
+ if (DataOpIdx == -1)
+ DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
+ Width = getOpSize(LdSt, DataOpIdx);
return true;
}
@@ -399,15 +438,13 @@ bool SIInstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt,
}
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
- const MachineOperand &BaseOp1,
+ ArrayRef<const MachineOperand *> BaseOps1,
const MachineInstr &MI2,
- const MachineOperand &BaseOp2) {
- // Support only base operands with base registers.
- // Note: this could be extended to support FI operands.
- if (!BaseOp1.isReg() || !BaseOp2.isReg())
- return false;
-
- if (BaseOp1.isIdenticalTo(BaseOp2))
+ ArrayRef<const MachineOperand *> BaseOps2) {
+ // Only examine the first "base" operand of each instruction, on the
+ // assumption that it represents the real base address of the memory access.
+ // Other operands are typically offsets or indices from this base address.
+ if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
return true;
if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
@@ -433,62 +470,31 @@ static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
return Base1 == Base2;
}
-bool SIInstrInfo::shouldClusterMemOps(const MachineOperand &BaseOp1,
- const MachineOperand &BaseOp2,
- unsigned NumLoads) const {
- const MachineInstr &FirstLdSt = *BaseOp1.getParent();
- const MachineInstr &SecondLdSt = *BaseOp2.getParent();
-
- if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOp1, SecondLdSt, BaseOp2))
- return false;
-
- const MachineOperand *FirstDst = nullptr;
- const MachineOperand *SecondDst = nullptr;
-
- if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
- (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) ||
- (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) {
- const unsigned MaxGlobalLoadCluster = 6;
- if (NumLoads > MaxGlobalLoadCluster)
- return false;
-
- FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata);
- if (!FirstDst)
- FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
- SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata);
- if (!SecondDst)
- SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
- } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) {
- FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst);
- SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst);
- } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) {
- FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
- SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
- }
-
- if (!FirstDst || !SecondDst)
+bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
+ ArrayRef<const MachineOperand *> BaseOps2,
+ unsigned NumLoads,
+ unsigned NumBytes) const {
+ // If the current pair of mem ops does not share the same base pointer, they
+ // cannot be clustered.
+ assert(!BaseOps1.empty() && !BaseOps2.empty());
+ const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
+ const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
+ if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
return false;
- // Try to limit clustering based on the total number of bytes loaded
- // rather than the number of instructions. This is done to help reduce
- // register pressure. The method used is somewhat inexact, though,
- // because it assumes that all loads in the cluster will load the
- // same number of bytes as FirstLdSt.
-
- // The unit of this value is bytes.
- // FIXME: This needs finer tuning.
- unsigned LoadClusterThreshold = 16;
-
- const MachineRegisterInfo &MRI =
- FirstLdSt.getParent()->getParent()->getRegInfo();
-
- const Register Reg = FirstDst->getReg();
-
- const TargetRegisterClass *DstRC = Register::isVirtualRegister(Reg)
- ? MRI.getRegClass(Reg)
- : RI.getPhysRegClass(Reg);
-
- return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold;
+ // Compute the max cluster size based on the average number of bytes clustered
+ // so far, and use it to decide whether the current pair of mem ops can be
+ // clustered.
+ assert((NumLoads > 0) && (NumBytes > 0) && (NumBytes >= NumLoads) &&
+ "Invalid NumLoads/NumBytes values");
+ unsigned MaxNumLoads;
+ if (NumBytes <= 4 * NumLoads) {
+ // Loads are dword or smaller (on average).
+ MaxNumLoads = 5;
+ } else {
+ // Loads are bigger than a dword (on average).
+ MaxNumLoads = 4;
+ }
+ return NumLoads <= MaxNumLoads;
}
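
For reference, a minimal standalone sketch of the new clustering cut-off; the function name and the small driver below are illustrative, not part of the patch. The scheduler supplies the number of mem ops in the prospective cluster and the total bytes they access, and the cap is five ops when the average access is a dword or smaller, otherwise four.

#include <cassert>
#include <cstdio>

// Illustrative re-implementation of the clustering cut-off added above.
static bool shouldCluster(unsigned NumLoads, unsigned NumBytes) {
  assert(NumLoads > 0 && NumBytes >= NumLoads && "invalid inputs");
  // An average access size of a dword or less allows a slightly larger cluster.
  unsigned MaxNumLoads = (NumBytes <= 4 * NumLoads) ? 5 : 4;
  return NumLoads <= MaxNumLoads;
}

int main() {
  std::printf("%d\n", shouldCluster(5, 20)); // five dword loads: clustered (1)
  std::printf("%d\n", shouldCluster(5, 40)); // five 8-byte loads: rejected (0)
}
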
// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
@@ -516,11 +522,10 @@ bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
const DebugLoc &DL, MCRegister DestReg,
- MCRegister SrcReg, bool KillSrc) {
+ MCRegister SrcReg, bool KillSrc,
+ const char *Msg = "illegal SGPR to VGPR copy") {
MachineFunction *MF = MBB.getParent();
- DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(),
- "illegal SGPR to VGPR copy",
- DL, DS_Error);
+ DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), Msg, DL, DS_Error);
LLVMContext &C = MF->getFunction().getContext();
C.diagnose(IllegalCopy);
@@ -534,6 +539,25 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MCRegister SrcReg, bool KillSrc) const {
const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg);
+ // FIXME: This is a hack to resolve copies between 16-bit and 32-bit
+ // registers until all patterns are fixed.
+ if (Fix16BitCopies &&
+ ((RI.getRegSizeInBits(*RC) == 16) ^
+ (RI.getRegSizeInBits(*RI.getPhysRegClass(SrcReg)) == 16))) {
+ MCRegister &RegToFix = (RI.getRegSizeInBits(*RC) == 16) ? DestReg : SrcReg;
+ MCRegister Super = RI.get32BitRegister(RegToFix);
+ assert(RI.getSubReg(Super, AMDGPU::lo16) == RegToFix);
+ RegToFix = Super;
+
+ if (DestReg == SrcReg) {
+ // Insert empty bundle since ExpandPostRA expects an instruction here.
+ BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
+ return;
+ }
+
+ RC = RI.getPhysRegClass(DestReg);
+ }
+
if (RC == &AMDGPU::VGPR_32RegClass) {
assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
AMDGPU::SReg_32RegClass.contains(SrcReg) ||
@@ -580,6 +604,13 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
}
if (RC == &AMDGPU::SReg_64RegClass) {
+ if (SrcReg == AMDGPU::SCC) {
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
+ .addImm(1)
+ .addImm(0);
+ return;
+ }
+
if (DestReg == AMDGPU::VCC) {
if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
@@ -606,10 +637,18 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
}
if (DestReg == AMDGPU::SCC) {
+ // Copying 64-bit or 32-bit sources to SCC barely makes sense,
+ // but SelectionDAG emits such copies for i1 sources.
+ // TODO: Use S_BITCMP0_B32 instead and only consider the 0th bit.
+ if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
+ SrcReg = RI.getSubReg(SrcReg, AMDGPU::sub0);
+ }
assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
+
BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
- .addReg(SrcReg, getKillRegState(KillSrc))
- .addImm(0);
+ .addReg(SrcReg, getKillRegState(KillSrc))
+ .addImm(0);
+
return;
}
@@ -660,7 +699,7 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
// Registers in the sequence are allocated contiguously so we can just
// use register number to pick one of three round-robin temps.
unsigned RegNo = DestReg % 3;
- unsigned Tmp = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
+ Register Tmp = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
if (!Tmp)
report_fatal_error("Cannot scavenge VGPR to copy to AGPR");
RS.setRegUsed(Tmp);
@@ -685,6 +724,72 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
+ if (RI.getRegSizeInBits(*RC) == 16) {
+ assert(AMDGPU::VGPR_LO16RegClass.contains(SrcReg) ||
+ AMDGPU::VGPR_HI16RegClass.contains(SrcReg) ||
+ AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
+ AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
+
+ bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
+ bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
+ bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
+ bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
+ bool DstLow = AMDGPU::VGPR_LO16RegClass.contains(DestReg) ||
+ AMDGPU::SReg_LO16RegClass.contains(DestReg) ||
+ AMDGPU::AGPR_LO16RegClass.contains(DestReg);
+ bool SrcLow = AMDGPU::VGPR_LO16RegClass.contains(SrcReg) ||
+ AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
+ AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
+ MCRegister NewDestReg = RI.get32BitRegister(DestReg);
+ MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
+
+ if (IsSGPRDst) {
+ if (!IsSGPRSrc) {
+ reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
+ return;
+ }
+
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
+ .addReg(NewSrcReg, getKillRegState(KillSrc));
+ return;
+ }
+
+ if (IsAGPRDst || IsAGPRSrc) {
+ if (!DstLow || !SrcLow) {
+ reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
+ "Cannot use hi16 subreg with an AGPR!");
+ }
+
+ copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
+ return;
+ }
+
+ if (IsSGPRSrc && !ST.hasSDWAScalar()) {
+ if (!DstLow || !SrcLow) {
+ reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
+ "Cannot use hi16 subreg on VI!");
+ }
+
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
+ .addReg(NewSrcReg, getKillRegState(KillSrc));
+ return;
+ }
+
+ auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
+ .addImm(0) // src0_modifiers
+ .addReg(NewSrcReg)
+ .addImm(0) // clamp
+ .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
+ : AMDGPU::SDWA::SdwaSel::WORD_1)
+ .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
+ .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
+ : AMDGPU::SDWA::SdwaSel::WORD_1)
+ .addReg(NewDestReg, RegState::Implicit | RegState::Undef);
+ // First implicit operand is $exec.
+ MIB->tieOperands(0, MIB->getNumOperands() - 1);
+ return;
+ }
+
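
For intuition, the SDWA move emitted above behaves roughly like the following bit manipulation; this is a hedged standalone sketch, not the target's actual semantics model. The selected 16-bit word of the source is written into the selected word of the destination while the other destination word is preserved, mirroring dst_unused = UNUSED_PRESERVE; DstLow/SrcLow choose WORD_0 versus WORD_1.

#include <cstdint>
#include <cstdio>

// Move one 16-bit half of Src into one 16-bit half of Dst, keeping the other
// half of Dst unchanged (the UNUSED_PRESERVE behavior).
static uint32_t sdwaMov16(uint32_t Dst, uint32_t Src, bool DstLow, bool SrcLow) {
  uint32_t Half = SrcLow ? (Src & 0xFFFFu) : (Src >> 16);
  return DstLow ? ((Dst & 0xFFFF0000u) | Half)
                : ((Dst & 0x0000FFFFu) | (Half << 16));
}

int main() {
  // Copy the high half of Src into the low half of Dst; Dst's high half stays.
  std::printf("0x%08X\n",
              sdwaMov16(0x11112222u, 0xAAAABBBBu, /*DstLow=*/true,
                        /*SrcLow=*/false)); // prints 0x1111AAAA
}
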
unsigned EltSize = 4;
unsigned Opcode = AMDGPU::V_MOV_B32_e32;
if (RI.isSGPRClass(RC)) {
@@ -806,7 +911,7 @@ void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
int64_t IdxValue = Idx == 0 ? Value : 0;
MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
- get(Opcode), RI.getSubReg(DestReg, Idx));
+ get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx]));
Builder.addImm(IdxValue);
}
}
@@ -818,10 +923,10 @@ SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
- const DebugLoc &DL, unsigned DstReg,
+ const DebugLoc &DL, Register DstReg,
ArrayRef<MachineOperand> Cond,
- unsigned TrueReg,
- unsigned FalseReg) const {
+ Register TrueReg,
+ Register FalseReg) const {
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
MachineFunction *MF = MBB.getParent();
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
@@ -944,10 +1049,10 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
}
}
-unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
+Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
MachineBasicBlock::iterator I,
const DebugLoc &DL,
- unsigned SrcReg, int Value) const {
+ Register SrcReg, int Value) const {
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
@@ -957,10 +1062,10 @@ unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
return Reg;
}
-unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB,
+Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
MachineBasicBlock::iterator I,
const DebugLoc &DL,
- unsigned SrcReg, int Value) const {
+ Register SrcReg, int Value) const {
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
@@ -984,6 +1089,80 @@ unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
return AMDGPU::COPY;
}
+static unsigned getIndirectVGPRWritePseudoOpc(unsigned VecSize) {
+ if (VecSize <= 32) // 4 bytes
+ return AMDGPU::V_INDIRECT_REG_WRITE_B32_V1;
+ if (VecSize <= 64) // 8 bytes
+ return AMDGPU::V_INDIRECT_REG_WRITE_B32_V2;
+ if (VecSize <= 96) // 12 bytes
+ return AMDGPU::V_INDIRECT_REG_WRITE_B32_V3;
+ if (VecSize <= 128) // 16 bytes
+ return AMDGPU::V_INDIRECT_REG_WRITE_B32_V4;
+ if (VecSize <= 160) // 20 bytes
+ return AMDGPU::V_INDIRECT_REG_WRITE_B32_V5;
+ if (VecSize <= 256) // 32 bytes
+ return AMDGPU::V_INDIRECT_REG_WRITE_B32_V8;
+ if (VecSize <= 512) // 64 bytes
+ return AMDGPU::V_INDIRECT_REG_WRITE_B32_V16;
+ if (VecSize <= 1024) // 128 bytes
+ return AMDGPU::V_INDIRECT_REG_WRITE_B32_V32;
+
+ llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
+}
+
+static unsigned getIndirectSGPRWritePseudo32(unsigned VecSize) {
+ if (VecSize <= 32) // 4 bytes
+ return AMDGPU::S_INDIRECT_REG_WRITE_B32_V1;
+ if (VecSize <= 64) // 8 bytes
+ return AMDGPU::S_INDIRECT_REG_WRITE_B32_V2;
+ if (VecSize <= 96) // 12 bytes
+ return AMDGPU::S_INDIRECT_REG_WRITE_B32_V3;
+ if (VecSize <= 128) // 16 bytes
+ return AMDGPU::S_INDIRECT_REG_WRITE_B32_V4;
+ if (VecSize <= 160) // 20 bytes
+ return AMDGPU::S_INDIRECT_REG_WRITE_B32_V5;
+ if (VecSize <= 256) // 32 bytes
+ return AMDGPU::S_INDIRECT_REG_WRITE_B32_V8;
+ if (VecSize <= 512) // 64 bytes
+ return AMDGPU::S_INDIRECT_REG_WRITE_B32_V16;
+ if (VecSize <= 1024) // 128 bytes
+ return AMDGPU::S_INDIRECT_REG_WRITE_B32_V32;
+
+ llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
+}
+
+static unsigned getIndirectSGPRWritePseudo64(unsigned VecSize) {
+ if (VecSize <= 64) // 8 bytes
+ return AMDGPU::S_INDIRECT_REG_WRITE_B64_V1;
+ if (VecSize <= 128) // 16 bytes
+ return AMDGPU::S_INDIRECT_REG_WRITE_B64_V2;
+ if (VecSize <= 256) // 32 bytes
+ return AMDGPU::S_INDIRECT_REG_WRITE_B64_V4;
+ if (VecSize <= 512) // 64 bytes
+ return AMDGPU::S_INDIRECT_REG_WRITE_B64_V8;
+ if (VecSize <= 1024) // 128 bytes
+ return AMDGPU::S_INDIRECT_REG_WRITE_B64_V16;
+
+ llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
+}
+
+const MCInstrDesc &SIInstrInfo::getIndirectRegWritePseudo(
+ unsigned VecSize, unsigned EltSize, bool IsSGPR) const {
+ if (IsSGPR) {
+ switch (EltSize) {
+ case 32:
+ return get(getIndirectSGPRWritePseudo32(VecSize));
+ case 64:
+ return get(getIndirectSGPRWritePseudo64(VecSize));
+ default:
+ llvm_unreachable("invalid reg indexing elt size");
+ }
+ }
+
+ assert(EltSize == 32 && "invalid reg indexing elt size");
+ return get(getIndirectVGPRWritePseudoOpc(VecSize));
+}
+
static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
switch (Size) {
case 4:
@@ -996,6 +1175,8 @@ static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_S128_SAVE;
case 20:
return AMDGPU::SI_SPILL_S160_SAVE;
+ case 24:
+ return AMDGPU::SI_SPILL_S192_SAVE;
case 32:
return AMDGPU::SI_SPILL_S256_SAVE;
case 64:
@@ -1019,6 +1200,8 @@ static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_V128_SAVE;
case 20:
return AMDGPU::SI_SPILL_V160_SAVE;
+ case 24:
+ return AMDGPU::SI_SPILL_V192_SAVE;
case 32:
return AMDGPU::SI_SPILL_V256_SAVE;
case 64:
@@ -1049,7 +1232,7 @@ static unsigned getAGPRSpillSaveOpcode(unsigned Size) {
void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- unsigned SrcReg, bool isKill,
+ Register SrcReg, bool isKill,
int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
@@ -1058,18 +1241,18 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineFrameInfo &FrameInfo = MF->getFrameInfo();
const DebugLoc &DL = MBB.findDebugLoc(MI);
- unsigned Size = FrameInfo.getObjectSize(FrameIndex);
- unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
MachinePointerInfo PtrInfo
= MachinePointerInfo::getFixedStack(*MF, FrameIndex);
- MachineMemOperand *MMO
- = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
- Size, Align);
+ MachineMemOperand *MMO = MF->getMachineMemOperand(
+ PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
+ FrameInfo.getObjectAlign(FrameIndex));
unsigned SpillSize = TRI->getSpillSize(*RC);
if (RI.isSGPRClass(RC)) {
MFI->setHasSpilledSGPRs();
assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
+ assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
+ SrcReg != AMDGPU::EXEC && "exec should not be spilled");
// We are only allowed to create one new instruction when spilling
// registers, so we need to use pseudo instruction for spilling SGPRs.
@@ -1079,7 +1262,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
// to make sure we are using the correct register class.
if (Register::isVirtualRegister(SrcReg) && SpillSize == 4) {
MachineRegisterInfo &MRI = MF->getRegInfo();
- MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
+ MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
}
BuildMI(MBB, MI, DL, OpDesc)
@@ -1126,6 +1309,8 @@ static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_S128_RESTORE;
case 20:
return AMDGPU::SI_SPILL_S160_RESTORE;
+ case 24:
+ return AMDGPU::SI_SPILL_S192_RESTORE;
case 32:
return AMDGPU::SI_SPILL_S256_RESTORE;
case 64:
@@ -1149,6 +1334,8 @@ static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_V128_RESTORE;
case 20:
return AMDGPU::SI_SPILL_V160_RESTORE;
+ case 24:
+ return AMDGPU::SI_SPILL_V192_RESTORE;
case 32:
return AMDGPU::SI_SPILL_V256_RESTORE;
case 64:
@@ -1179,33 +1366,34 @@ static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- unsigned DestReg, int FrameIndex,
+ Register DestReg, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
MachineFunction *MF = MBB.getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
MachineFrameInfo &FrameInfo = MF->getFrameInfo();
const DebugLoc &DL = MBB.findDebugLoc(MI);
- unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
- unsigned Size = FrameInfo.getObjectSize(FrameIndex);
unsigned SpillSize = TRI->getSpillSize(*RC);
MachinePointerInfo PtrInfo
= MachinePointerInfo::getFixedStack(*MF, FrameIndex);
MachineMemOperand *MMO = MF->getMachineMemOperand(
- PtrInfo, MachineMemOperand::MOLoad, Size, Align);
+ PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
+ FrameInfo.getObjectAlign(FrameIndex));
if (RI.isSGPRClass(RC)) {
MFI->setHasSpilledSGPRs();
assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
+ assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
+ DestReg != AMDGPU::EXEC && "exec should not be spilled");
// FIXME: Maybe this should not include a memoperand because it will be
// lowered to non-memory instructions.
const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
- if (Register::isVirtualRegister(DestReg) && SpillSize == 4) {
+ if (DestReg.isVirtual() && SpillSize == 4) {
MachineRegisterInfo &MRI = MF->getRegInfo();
- MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
+ MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
}
if (RI.spillSGPRToVGPR())
@@ -1244,7 +1432,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(
unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize();
unsigned WavefrontSize = ST.getWavefrontSize();
- unsigned TIDReg = MFI->getTIDReg();
+ Register TIDReg = MFI->getTIDReg();
if (!MFI->hasCalculatedTID()) {
MachineBasicBlock &Entry = MBB.getParent()->front();
MachineBasicBlock::iterator Insert = Entry.front();
@@ -1272,8 +1460,8 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(
RS->enterBasicBlock(Entry);
// FIXME: Can we scavenge an SReg_64 and access the subregs?
- unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
- unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
+ Register STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
+ Register STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
.addReg(InputPtrReg)
.addImm(SI::KernelInputOffsets::NGROUPS_Z);
@@ -1482,30 +1670,55 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.eraseFromParent();
break;
}
- case AMDGPU::V_MOVRELD_B32_V1:
- case AMDGPU::V_MOVRELD_B32_V2:
- case AMDGPU::V_MOVRELD_B32_V4:
- case AMDGPU::V_MOVRELD_B32_V8:
- case AMDGPU::V_MOVRELD_B32_V16: {
- const MCInstrDesc &MovRelDesc = get(AMDGPU::V_MOVRELD_B32_e32);
+ case AMDGPU::V_INDIRECT_REG_WRITE_B32_V1:
+ case AMDGPU::V_INDIRECT_REG_WRITE_B32_V2:
+ case AMDGPU::V_INDIRECT_REG_WRITE_B32_V3:
+ case AMDGPU::V_INDIRECT_REG_WRITE_B32_V4:
+ case AMDGPU::V_INDIRECT_REG_WRITE_B32_V5:
+ case AMDGPU::V_INDIRECT_REG_WRITE_B32_V8:
+ case AMDGPU::V_INDIRECT_REG_WRITE_B32_V16:
+ case AMDGPU::V_INDIRECT_REG_WRITE_B32_V32:
+ case AMDGPU::S_INDIRECT_REG_WRITE_B32_V1:
+ case AMDGPU::S_INDIRECT_REG_WRITE_B32_V2:
+ case AMDGPU::S_INDIRECT_REG_WRITE_B32_V3:
+ case AMDGPU::S_INDIRECT_REG_WRITE_B32_V4:
+ case AMDGPU::S_INDIRECT_REG_WRITE_B32_V5:
+ case AMDGPU::S_INDIRECT_REG_WRITE_B32_V8:
+ case AMDGPU::S_INDIRECT_REG_WRITE_B32_V16:
+ case AMDGPU::S_INDIRECT_REG_WRITE_B32_V32:
+ case AMDGPU::S_INDIRECT_REG_WRITE_B64_V1:
+ case AMDGPU::S_INDIRECT_REG_WRITE_B64_V2:
+ case AMDGPU::S_INDIRECT_REG_WRITE_B64_V4:
+ case AMDGPU::S_INDIRECT_REG_WRITE_B64_V8:
+ case AMDGPU::S_INDIRECT_REG_WRITE_B64_V16: {
+ const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
+
+ unsigned Opc;
+ if (RI.hasVGPRs(EltRC)) {
+ Opc = ST.useVGPRIndexMode() ?
+ AMDGPU::V_MOV_B32_indirect : AMDGPU::V_MOVRELD_B32_e32;
+ } else {
+ Opc = RI.getRegSizeInBits(*EltRC) == 64 ?
+ AMDGPU::S_MOVRELD_B64 : AMDGPU::S_MOVRELD_B32;
+ }
+
+ const MCInstrDesc &OpDesc = get(Opc);
Register VecReg = MI.getOperand(0).getReg();
bool IsUndef = MI.getOperand(1).isUndef();
- unsigned SubReg = AMDGPU::sub0 + MI.getOperand(3).getImm();
+ unsigned SubReg = MI.getOperand(3).getImm();
assert(VecReg == MI.getOperand(1).getReg());
- MachineInstr *MovRel =
- BuildMI(MBB, MI, DL, MovRelDesc)
- .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
- .add(MI.getOperand(2))
- .addReg(VecReg, RegState::ImplicitDefine)
- .addReg(VecReg,
- RegState::Implicit | (IsUndef ? RegState::Undef : 0));
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MI, DL, OpDesc)
+ .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
+ .add(MI.getOperand(2))
+ .addReg(VecReg, RegState::ImplicitDefine)
+ .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
const int ImpDefIdx =
- MovRelDesc.getNumOperands() + MovRelDesc.getNumImplicitUses();
+ OpDesc.getNumOperands() + OpDesc.getNumImplicitUses();
const int ImpUseIdx = ImpDefIdx + 1;
- MovRel->tieOperands(ImpDefIdx, ImpUseIdx);
-
+ MIB->tieOperands(ImpDefIdx, ImpUseIdx);
MI.eraseFromParent();
break;
}
@@ -1549,22 +1762,6 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
break;
}
- case TargetOpcode::BUNDLE: {
- if (!MI.mayLoad() || MI.hasUnmodeledSideEffects())
- return false;
-
- // If it is a load it must be a memory clause
- for (MachineBasicBlock::instr_iterator I = MI.getIterator();
- I->isBundledWithSucc(); ++I) {
- I->unbundleFromSucc();
- for (MachineOperand &MO : I->operands())
- if (MO.isReg())
- MO.setIsInternalRead(false);
- }
-
- MI.eraseFromParent();
- break;
- }
}
return true;
}
@@ -1662,9 +1859,15 @@ static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
RegOp.ChangeToImmediate(NonRegOp.getImm());
else if (NonRegOp.isFI())
RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
- else
+ else if (NonRegOp.isGlobal()) {
+ RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
+ NonRegOp.getTargetFlags());
+ } else
return nullptr;
+ // Make sure we don't reinterpret a subreg index in the target flags.
+ RegOp.setTargetFlags(NonRegOp.getTargetFlags());
+
NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
NonRegOp.setSubReg(SubReg);
@@ -2085,6 +2288,7 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
// Copy the flags onto the implicit condition register operand.
preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
+ fixImplicitOperands(*CondBr);
if (BytesAdded)
*BytesAdded = 4;
@@ -2125,8 +2329,8 @@ bool SIInstrInfo::reverseBranchCondition(
bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
ArrayRef<MachineOperand> Cond,
- unsigned TrueReg, unsigned FalseReg,
- int &CondCycles,
+ Register DstReg, Register TrueReg,
+ Register FalseReg, int &CondCycles,
int &TrueCycles, int &FalseCycles) const {
switch (Cond[0].getImm()) {
case VCCNZ:
@@ -2165,8 +2369,8 @@ bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I, const DebugLoc &DL,
- unsigned DstReg, ArrayRef<MachineOperand> Cond,
- unsigned TrueReg, unsigned FalseReg) const {
+ Register DstReg, ArrayRef<MachineOperand> Cond,
+ Register TrueReg, Register FalseReg) const {
BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
if (Pred == VCCZ || Pred == SCC_FALSE) {
Pred = static_cast<BranchPredicate>(-Pred);
@@ -2178,14 +2382,17 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
unsigned DstSize = RI.getRegSizeInBits(*DstRC);
if (DstSize == 32) {
- unsigned SelOp = Pred == SCC_TRUE ?
- AMDGPU::S_CSELECT_B32 : AMDGPU::V_CNDMASK_B32_e32;
-
- // Instruction's operands are backwards from what is expected.
- MachineInstr *Select =
- BuildMI(MBB, I, DL, get(SelOp), DstReg)
- .addReg(FalseReg)
- .addReg(TrueReg);
+ MachineInstr *Select;
+ if (Pred == SCC_TRUE) {
+ Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
+ .addReg(TrueReg)
+ .addReg(FalseReg);
+ } else {
+ // Instruction's operands are backwards from what is expected.
+ Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
+ .addReg(FalseReg)
+ .addReg(TrueReg);
+ }
preserveCondRegFlags(Select->getOperand(3), Cond[1]);
return;
@@ -2194,8 +2401,8 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
if (DstSize == 64 && Pred == SCC_TRUE) {
MachineInstr *Select =
BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
- .addReg(FalseReg)
- .addReg(TrueReg);
+ .addReg(TrueReg)
+ .addReg(FalseReg);
preserveCondRegFlags(Select->getOperand(3), Cond[1]);
return;
@@ -2239,17 +2446,26 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
I = MIB->getIterator();
- SmallVector<unsigned, 8> Regs;
+ SmallVector<Register, 8> Regs;
for (int Idx = 0; Idx != NElts; ++Idx) {
Register DstElt = MRI.createVirtualRegister(EltRC);
Regs.push_back(DstElt);
unsigned SubIdx = SubIndices[Idx];
- MachineInstr *Select =
- BuildMI(MBB, I, DL, get(SelOp), DstElt)
- .addReg(FalseReg, 0, SubIdx)
- .addReg(TrueReg, 0, SubIdx);
+ MachineInstr *Select;
+ if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
+ Select =
+ BuildMI(MBB, I, DL, get(SelOp), DstElt)
+ .addReg(FalseReg, 0, SubIdx)
+ .addReg(TrueReg, 0, SubIdx);
+ } else {
+ Select =
+ BuildMI(MBB, I, DL, get(SelOp), DstElt)
+ .addReg(TrueReg, 0, SubIdx)
+ .addReg(FalseReg, 0, SubIdx);
+ }
+
preserveCondRegFlags(Select->getOperand(3), Cond[1]);
fixImplicitOperands(*Select);
@@ -2313,7 +2529,7 @@ static void removeModOperands(MachineInstr &MI) {
}
bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
- unsigned Reg, MachineRegisterInfo *MRI) const {
+ Register Reg, MachineRegisterInfo *MRI) const {
if (!MRI->hasOneNonDBGUse(Reg))
return false;
@@ -2339,15 +2555,40 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
unsigned Opc = UseMI.getOpcode();
if (Opc == AMDGPU::COPY) {
- bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg());
+ Register DstReg = UseMI.getOperand(0).getReg();
+ bool Is16Bit = getOpSize(UseMI, 0) == 2;
+ bool isVGPRCopy = RI.isVGPR(*MRI, DstReg);
unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
- if (RI.isAGPR(*MRI, UseMI.getOperand(0).getReg())) {
- if (!isInlineConstant(*ImmOp, AMDGPU::OPERAND_REG_INLINE_AC_INT32))
+ APInt Imm(32, ImmOp->getImm());
+
+ if (UseMI.getOperand(1).getSubReg() == AMDGPU::hi16)
+ Imm = Imm.ashr(16);
+
+ if (RI.isAGPR(*MRI, DstReg)) {
+ if (!isInlineConstant(Imm))
return false;
NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32;
}
+
+ if (Is16Bit) {
+ if (isVGPRCopy)
+ return false; // Do not clobber vgpr_hi16
+
+ if (DstReg.isVirtual() &&
+ UseMI.getOperand(0).getSubReg() != AMDGPU::lo16)
+ return false;
+
+ UseMI.getOperand(0).setSubReg(0);
+ if (DstReg.isPhysical()) {
+ DstReg = RI.get32BitRegister(DstReg);
+ UseMI.getOperand(0).setReg(DstReg);
+ }
+ assert(UseMI.getOperand(1).getReg().isVirtual());
+ }
+
UseMI.setDesc(get(NewOpc));
- UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm());
+ UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue());
+ UseMI.getOperand(1).setTargetFlags(0);
UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
return true;
}
@@ -2517,6 +2758,18 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
return false;
}
+static bool
+memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
+ ArrayRef<const MachineOperand *> BaseOps2) {
+ if (BaseOps1.size() != BaseOps2.size())
+ return false;
+ for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
+ if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
+ return false;
+ }
+ return true;
+}
+
static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
int WidthB, int OffsetB) {
int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
@@ -2527,26 +2780,26 @@ static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
const MachineInstr &MIb) const {
- const MachineOperand *BaseOp0, *BaseOp1;
+ SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
int64_t Offset0, Offset1;
+ unsigned Dummy0, Dummy1;
+ bool Offset0IsScalable, Offset1IsScalable;
+ if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
+ Dummy0, &RI) ||
+ !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
+ Dummy1, &RI))
+ return false;
- if (getMemOperandWithOffset(MIa, BaseOp0, Offset0, &RI) &&
- getMemOperandWithOffset(MIb, BaseOp1, Offset1, &RI)) {
- if (!BaseOp0->isIdenticalTo(*BaseOp1))
- return false;
+ if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
+ return false;
- if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
- // FIXME: Handle ds_read2 / ds_write2.
- return false;
- }
- unsigned Width0 = (*MIa.memoperands_begin())->getSize();
- unsigned Width1 = (*MIb.memoperands_begin())->getSize();
- if (offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) {
- return true;
- }
+ if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
+ // FIXME: Handle ds_read2 / ds_write2.
+ return false;
}
-
- return false;
+ unsigned Width0 = MIa.memoperands().front()->getSize();
+ unsigned Width1 = MIb.memoperands().front()->getSize();
+ return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
}
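
The interval test behind checkInstOffsetsDoNotOverlap can be sketched in isolation: treating each access as a half-open byte range [Offset, Offset+Width), two accesses are disjoint when the lower one ends at or before the higher one starts. The function name and driver below are illustrative, a plausible reading of the partially shown helper rather than its exact body.

#include <cstdio>

// [OffsetA, OffsetA + WidthA) and [OffsetB, OffsetB + WidthB) are disjoint
// exactly when the lower access ends at or before the higher access begins.
static bool offsetsDisjoint(int WidthA, int OffsetA, int WidthB, int OffsetB) {
  int LowOffset  = OffsetA < OffsetB ? OffsetA : OffsetB;
  int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
  int LowWidth   = OffsetA < OffsetB ? WidthA : WidthB;
  return LowOffset + LowWidth <= HighOffset;
}

int main() {
  std::printf("%d\n", offsetsDisjoint(4, 0, 4, 4)); // adjacent dwords: 1
  std::printf("%d\n", offsetsDisjoint(8, 0, 4, 4)); // 8 bytes at 0 overlap: 0
}
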
bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
@@ -2586,7 +2839,7 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
if (isSMRD(MIb))
return checkInstOffsetsDoNotOverlap(MIa, MIb);
- return !isFLAT(MIb) && !isMUBUF(MIa) && !isMTBUF(MIa);
+ return !isFLAT(MIb) && !isMUBUF(MIb) && !isMTBUF(MIb);
}
if (isFLAT(MIa)) {
@@ -2732,16 +2985,30 @@ static bool changesVGPRIndexingMode(const MachineInstr &MI) {
bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
const MachineBasicBlock *MBB,
const MachineFunction &MF) const {
- // XXX - Do we want the SP check in the base implementation?
+ // Skip the check for SP writes done in the base implementation; it was
+ // apparently added due to compile-time concerns.
+ //
+ // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
+ // but is probably avoidable.
+
+ // Copied from base implementation.
+ // Terminators and labels can't be scheduled around.
+ if (MI.isTerminator() || MI.isPosition())
+ return true;
+
+ // INLINEASM_BR can jump to another block
+ if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
+ return true;
// Target-independent instructions do not have an implicit-use of EXEC, even
// when they operate on VGPRs. Treating EXEC modifications as scheduling
// boundaries prevents incorrect movements of such instructions.
- return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) ||
- MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
+
+ // TODO: Don't treat a setreg with a known constant that only changes MODE as
+ // a barrier.
+ return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
- MI.getOpcode() == AMDGPU::S_DENORM_MODE ||
changesVGPRIndexingMode(MI);
}
@@ -2755,6 +3022,20 @@ bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
Opcode == AMDGPU::DS_GWS_BARRIER;
}
+bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
+ // Skip the full operand and register alias search that modifiesRegister
+ // does. Only a handful of instructions touch this register, it is only an
+ // implicit def, and it does not alias any other registers.
+ if (const MCPhysReg *ImpDef = MI.getDesc().getImplicitDefs()) {
+ for (; ImpDef && *ImpDef; ++ImpDef) {
+ if (*ImpDef == AMDGPU::MODE)
+ return true;
+ }
+ }
+
+ return false;
+}
+
bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
unsigned Opcode = MI.getOpcode();
@@ -2780,6 +3061,10 @@ bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
if (MI.isCall() || MI.isInlineAsm())
return true; // conservative assumption
+ // A mode change is a scalar operation that influences vector instructions.
+ if (modifiesModeRegister(MI))
+ return true;
+
// These are like SALU instructions in terms of effects, so it's questionable
// whether we should return true for those.
//
@@ -2866,10 +3151,26 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
return AMDGPU::isInlinableLiteral64(MO.getImm(),
ST.hasInv2PiInlineImm());
case AMDGPU::OPERAND_REG_IMM_INT16:
- case AMDGPU::OPERAND_REG_IMM_FP16:
case AMDGPU::OPERAND_REG_INLINE_C_INT16:
- case AMDGPU::OPERAND_REG_INLINE_C_FP16:
case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
+ // We would expect inline immediates to not be concerned with an integer/fp
+ // distinction. However, in the case of 16-bit integer operations, the
+ // "floating point" values appear to not work. It seems read the low 16-bits
+ // of 32-bit immediates, which happens to always work for the integer
+ // values.
+ //
+ // See llvm bugzilla 46302.
+ //
+ // TODO: Theoretically we could use op-sel to use the high bits of the
+ // 32-bit FP values.
+ return AMDGPU::isInlinableIntLiteral(Imm);
+ case AMDGPU::OPERAND_REG_IMM_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
+ // This suffers the same problem as the scalar 16-bit cases.
+ return AMDGPU::isInlinableIntLiteralV216(Imm);
+ case AMDGPU::OPERAND_REG_IMM_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP16:
case AMDGPU::OPERAND_REG_INLINE_AC_FP16: {
if (isInt<16>(Imm) || isUInt<16>(Imm)) {
// A few special case instructions have 16-bit operands on subtargets
@@ -2883,11 +3184,8 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
return false;
}
- case AMDGPU::OPERAND_REG_IMM_V2INT16:
case AMDGPU::OPERAND_REG_IMM_V2FP16:
- case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
- case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: {
uint32_t Trunc = static_cast<uint32_t>(Imm);
return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm());
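
To make the comment above concrete, here is a small hedged sketch of why checking the value as an integer literal works: a 16-bit integer operation effectively sees only the low 16 bits of the 32-bit encoded immediate. The helper below and the assumed inline integer range of -16..64 are illustrative stand-ins, not the real AMDGPU utility functions.

#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for the inlinable-integer test, assuming the inline
// integer range is -16..64 on this target.
static bool isInlinableInt(int64_t Imm) { return Imm >= -16 && Imm <= 64; }

int main() {
  // A 16-bit integer op effectively consumes only the low 16 bits of the
  // 32-bit encoded immediate; the high bits are ignored.
  uint32_t Encoded = 0xABCD0040u;
  int16_t Low16 = static_cast<int16_t>(Encoded & 0xFFFFu);
  std::printf("low16 = %d, inlinable = %d\n", Low16, isInlinableInt(Low16));
}
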
@@ -3056,7 +3354,8 @@ static void copyFlagsToImplicitVCC(MachineInstr &MI,
const MachineOperand &Orig) {
for (MachineOperand &Use : MI.implicit_operands()) {
- if (Use.isUse() && Use.getReg() == AMDGPU::VCC) {
+ if (Use.isUse() &&
+ (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
Use.setIsUndef(Orig.isUndef());
Use.setIsKill(Orig.isKill());
return;
@@ -3068,7 +3367,8 @@ MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
unsigned Op32) const {
MachineBasicBlock *MBB = MI.getParent();
MachineInstrBuilder Inst32 =
- BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32));
+ BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32))
+ .setMIFlags(MI.getFlags());
// Add the dst operand if the 32-bit encoding also has an explicit $vdst.
// For VOPC instructions, this is replaced by an implicit def of vcc.
@@ -3138,7 +3438,7 @@ bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
}
}
-static unsigned findImplicitSGPRRead(const MachineInstr &MI) {
+static Register findImplicitSGPRRead(const MachineInstr &MI) {
for (const MachineOperand &MO : MI.implicit_operands()) {
// We only care about reads.
if (MO.isDef())
@@ -3239,6 +3539,11 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
return true;
}
+ if (isMIMG(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
+ ErrInfo = "missing memory operand from MIMG instruction.";
+ return false;
+ }
+
// Make sure the register classes are correct.
for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
if (MI.getOperand(i).isFPImm()) {
@@ -3446,8 +3751,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1)
++ConstantBusCount;
- SmallVector<unsigned, 2> SGPRsUsed;
- unsigned SGPRUsed = findImplicitSGPRRead(MI);
+ SmallVector<Register, 2> SGPRsUsed;
+ Register SGPRUsed = findImplicitSGPRRead(MI);
if (SGPRUsed != AMDGPU::NoRegister) {
++ConstantBusCount;
SGPRsUsed.push_back(SGPRUsed);
@@ -3482,7 +3787,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
if (isVOP3(MI) && LiteralCount) {
- if (LiteralCount && !ST.hasVOP3Literal()) {
+ if (!ST.hasVOP3Literal()) {
ErrInfo = "VOP3 instruction uses literal";
return false;
}
@@ -3665,11 +3970,34 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
return false;
}
+ bool IsA16 = false;
+ if (ST.hasR128A16()) {
+ const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
+ IsA16 = R128A16->getImm() != 0;
+ } else if (ST.hasGFX10A16()) {
+ const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
+ IsA16 = A16->getImm() != 0;
+ }
+
+ bool PackDerivatives = IsA16 || BaseOpcode->G16;
bool IsNSA = SRsrcIdx - VAddr0Idx > 1;
- unsigned AddrWords = BaseOpcode->NumExtraArgs +
- (BaseOpcode->Gradients ? Dim->NumGradients : 0) +
- (BaseOpcode->Coordinates ? Dim->NumCoords : 0) +
- (BaseOpcode->LodOrClampOrMip ? 1 : 0);
+
+ unsigned AddrWords = BaseOpcode->NumExtraArgs;
+ unsigned AddrComponents = (BaseOpcode->Coordinates ? Dim->NumCoords : 0) +
+ (BaseOpcode->LodOrClampOrMip ? 1 : 0);
+ if (IsA16)
+ AddrWords += (AddrComponents + 1) / 2;
+ else
+ AddrWords += AddrComponents;
+
+ if (BaseOpcode->Gradients) {
+ if (PackDerivatives)
+ // There are two gradients per coordinate; we pack them separately.
+ // For the 3d case, we get (dy/du, dx/du) (-, dz/du) (dy/dv, dx/dv) (-, dz/dv)
+ AddrWords += (Dim->NumGradients / 2 + 1) / 2 * 2;
+ else
+ AddrWords += Dim->NumGradients;
+ }
unsigned VAddrWords;
if (IsNSA) {
@@ -3681,14 +4009,15 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
AddrWords = 16;
else if (AddrWords > 4)
AddrWords = 8;
- else if (AddrWords == 3 && VAddrWords == 4) {
- // CodeGen uses the V4 variant of instructions for three addresses,
- // because the selection DAG does not support non-power-of-two types.
+ else if (AddrWords == 4)
AddrWords = 4;
- }
+ else if (AddrWords == 3)
+ AddrWords = 3;
}
if (VAddrWords != AddrWords) {
+ LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
+ << " but got " << VAddrWords << "\n");
ErrInfo = "bad vaddr size";
return false;
}
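
A worked sketch of the address-word arithmetic above, under the same packing rules: with A16 the 16-bit address components pack two per dword, and packed derivatives round up to whole dwords per gradient direction. The helper and sample values below are assumptions for illustration only.

#include <cstdio>

// Illustrative recomputation of the address dword count checked above.
static unsigned addrWords(unsigned NumExtraArgs, unsigned NumCoords, bool HasLod,
                          unsigned NumGradients, bool IsA16, bool PackDerivatives) {
  unsigned Words = NumExtraArgs;
  unsigned Components = NumCoords + (HasLod ? 1 : 0);
  // With A16, the 16-bit address components are packed two per dword.
  Words += IsA16 ? (Components + 1) / 2 : Components;
  if (NumGradients) {
    if (PackDerivatives)
      // Derivatives pack per direction, rounded up to whole dwords per pair.
      Words += (NumGradients / 2 + 1) / 2 * 2;
    else
      Words += NumGradients;
  }
  return Words;
}

int main() {
  // 2D sample with LOD, A16 enabled, packed derivatives (4 gradient values):
  std::printf("%u\n", addrWords(0, 2, true, 4, true, true)); // prints 4
}
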
@@ -4217,7 +4546,7 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
}
}
-unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
+Register SIInstrInfo::readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI,
MachineRegisterInfo &MRI) const {
const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
@@ -5002,6 +5331,76 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
Inst.eraseFromParent();
continue;
+
+ // TODO: remove as soon as everything is ready
+ // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
+ // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
+ // can only be selected from the uniform SDNode.
+ case AMDGPU::S_ADD_CO_PSEUDO:
+ case AMDGPU::S_SUB_CO_PSEUDO: {
+ unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
+ ? AMDGPU::V_ADDC_U32_e64
+ : AMDGPU::V_SUBB_U32_e64;
+ const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
+
+ Register CarryInReg = Inst.getOperand(4).getReg();
+ if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
+ Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
+ BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
+ .addReg(CarryInReg);
+ }
+
+ Register CarryOutReg = Inst.getOperand(1).getReg();
+
+ Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
+ MRI.getRegClass(Inst.getOperand(0).getReg())));
+ MachineInstr *CarryOp =
+ BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
+ .addReg(CarryOutReg, RegState::Define)
+ .add(Inst.getOperand(2))
+ .add(Inst.getOperand(3))
+ .addReg(CarryInReg)
+ .addImm(0);
+ legalizeOperands(*CarryOp);
+ MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
+ addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
+ Inst.eraseFromParent();
+ }
+ continue;
+ case AMDGPU::S_UADDO_PSEUDO:
+ case AMDGPU::S_USUBO_PSEUDO: {
+ const DebugLoc &DL = Inst.getDebugLoc();
+ MachineOperand &Dest0 = Inst.getOperand(0);
+ MachineOperand &Dest1 = Inst.getOperand(1);
+ MachineOperand &Src0 = Inst.getOperand(2);
+ MachineOperand &Src1 = Inst.getOperand(3);
+
+ unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
+ ? AMDGPU::V_ADD_I32_e64
+ : AMDGPU::V_SUB_I32_e64;
+ const TargetRegisterClass *NewRC =
+ RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
+ Register DestReg = MRI.createVirtualRegister(NewRC);
+ MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
+ .addReg(Dest1.getReg(), RegState::Define)
+ .add(Src0)
+ .add(Src1)
+ .addImm(0); // clamp bit
+
+ legalizeOperands(*NewInstr, MDT);
+
+ MRI.replaceRegWith(Dest0.getReg(), DestReg);
+ addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
+ Worklist);
+ Inst.eraseFromParent();
+ }
+ continue;
+
+ case AMDGPU::S_CSELECT_B32:
+ case AMDGPU::S_CSELECT_B64:
+ lowerSelect(Worklist, Inst, MDT);
+ Inst.eraseFromParent();
+ continue;
}
if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
@@ -5142,6 +5541,78 @@ bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
return false;
}
+void SIInstrInfo::lowerSelect(SetVectorType &Worklist, MachineInstr &Inst,
+ MachineDominatorTree *MDT) const {
+
+ MachineBasicBlock &MBB = *Inst.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ MachineBasicBlock::iterator MII = Inst;
+ DebugLoc DL = Inst.getDebugLoc();
+
+ MachineOperand &Dest = Inst.getOperand(0);
+ MachineOperand &Src0 = Inst.getOperand(1);
+ MachineOperand &Src1 = Inst.getOperand(2);
+ MachineOperand &Cond = Inst.getOperand(3);
+
+ Register SCCSource = Cond.getReg();
+ // Find SCC def, and if that is a copy (SCC = COPY reg) then use reg instead.
+ if (!Cond.isUndef()) {
+ for (MachineInstr &CandI :
+ make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
+ Inst.getParent()->rend())) {
+ if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) !=
+ -1) {
+ if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
+ SCCSource = CandI.getOperand(1).getReg();
+ }
+ break;
+ }
+ }
+ }
+
+ // If this is a trivial select where the condition is effectively not SCC
+ // (SCCSource is a source of copy to SCC), then the select is semantically
+ // equivalent to copying SCCSource. Hence, there is no need to create a
+ // V_CNDMASK; we can just use SCCSource and bail out.
+ if ((SCCSource != AMDGPU::SCC) && Src0.isImm() && (Src0.getImm() == -1) &&
+ Src1.isImm() && (Src1.getImm() == 0)) {
+ MRI.replaceRegWith(Dest.getReg(), SCCSource);
+ return;
+ }
+
+ const TargetRegisterClass *TC = ST.getWavefrontSize() == 64
+ ? &AMDGPU::SReg_64_XEXECRegClass
+ : &AMDGPU::SReg_32_XM0_XEXECRegClass;
+ Register CopySCC = MRI.createVirtualRegister(TC);
+
+ if (SCCSource == AMDGPU::SCC) {
+ // Insert a trivial select instead of creating a copy, because a copy from
+ // SCC would semantically mean just copying a single bit, but we may need
+ // the result to be a vector condition mask that needs preserving.
+ unsigned Opcode = (ST.getWavefrontSize() == 64) ? AMDGPU::S_CSELECT_B64
+ : AMDGPU::S_CSELECT_B32;
+ auto NewSelect =
+ BuildMI(MBB, MII, DL, get(Opcode), CopySCC).addImm(-1).addImm(0);
+ NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
+ } else {
+ BuildMI(MBB, MII, DL, get(AMDGPU::COPY), CopySCC).addReg(SCCSource);
+ }
+
+ Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ auto UpdatedInst =
+ BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), ResultReg)
+ .addImm(0)
+ .add(Src1) // False
+ .addImm(0)
+ .add(Src0) // True
+ .addReg(CopySCC);
+
+ MRI.replaceRegWith(Dest.getReg(), ResultReg);
+ legalizeOperands(*UpdatedInst, MDT);
+ addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
+}
+
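
The trivial-select shortcut above rests on a simple identity: selecting -1 or 0 per lane according to a condition mask just reproduces the mask, so copying SCCSource is semantically equivalent. Below is a minimal sketch with a plain 64-bit integer standing in for a wave-64 lane mask; the names are illustrative only.

#include <cstdint>
#include <cstdio>

// Lane-wise select(cond, -1, 0) over a wave mask reproduces the mask itself,
// which is why the lowering above can forward SCCSource directly.
static uint64_t selectAllOnesOrZero(uint64_t Cond) {
  uint64_t Result = 0;
  for (unsigned Lane = 0; Lane < 64; ++Lane)
    if ((Cond >> Lane) & 1)          // lane condition true: select -1 (bit set)
      Result |= uint64_t(1) << Lane; // lane condition false: select 0 (bit clear)
  return Result;
}

int main() {
  uint64_t Mask = 0xDEADBEEFCAFEBABEull;
  std::printf("%d\n", selectAllOnesOrZero(Mask) == Mask); // prints 1
}
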
void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
MachineInstr &Inst) const {
MachineBasicBlock &MBB = *Inst.getParent();
@@ -5623,7 +6094,7 @@ void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
}
void SIInstrInfo::addUsersToMoveToVALUWorklist(
- unsigned DstReg,
+ Register DstReg,
MachineRegisterInfo &MRI,
SetVectorType &Worklist) const {
for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
@@ -5723,20 +6194,60 @@ void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
MachineInstr &SCCDefInst,
SetVectorType &Worklist) const {
+ bool SCCUsedImplicitly = false;
+
// Ensure that def inst defines SCC, which is still live.
assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
!Op.isDead() && Op.getParent() == &SCCDefInst);
+ SmallVector<MachineInstr *, 4> CopyToDelete;
// This assumes that all the users of SCC are in the same block
// as the SCC def.
for (MachineInstr &MI : // Skip the def inst itself.
make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
SCCDefInst.getParent()->end())) {
// Check if SCC is used first.
- if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI) != -1)
- Worklist.insert(&MI);
+ if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI) != -1) {
+ if (MI.isCopy()) {
+ MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ unsigned DestReg = MI.getOperand(0).getReg();
+
+ for (auto &User : MRI.use_nodbg_instructions(DestReg)) {
+ if ((User.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO) ||
+ (User.getOpcode() == AMDGPU::S_SUB_CO_PSEUDO)) {
+ User.getOperand(4).setReg(RI.getVCC());
+ Worklist.insert(&User);
+ } else if (User.getOpcode() == AMDGPU::V_CNDMASK_B32_e64) {
+ User.getOperand(5).setReg(RI.getVCC());
+ // No need to add to Worklist.
+ }
+ }
+ CopyToDelete.push_back(&MI);
+ } else {
+ if (MI.getOpcode() == AMDGPU::S_CSELECT_B32 ||
+ MI.getOpcode() == AMDGPU::S_CSELECT_B64) {
+ // This is an implicit use of SCC which the SCC users are expected
+ // to handle themselves.
+ // We cannot preserve the edge to the user, so add the explicit
+ // copy: SCC = COPY VCC.
+ // The copy will be cleaned up during the processing of the user
+ // in lowerSelect.
+ SCCUsedImplicitly = true;
+ }
+
+ Worklist.insert(&MI);
+ }
+ }
// Exit if we find another SCC def.
if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != -1)
- return;
+ break;
+ }
+ for (auto &Copy : CopyToDelete)
+ Copy->eraseFromParent();
+
+ if (SCCUsedImplicitly) {
+ BuildMI(*SCCDefInst.getParent(), std::next(SCCDefInst.getIterator()),
+ SCCDefInst.getDebugLoc(), get(AMDGPU::COPY), AMDGPU::SCC)
+ .addReg(RI.getVCC());
}
}
@@ -5789,7 +6300,7 @@ const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
}
// Find the one SGPR operand we are allowed to use.
-unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
+Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
int OpIndices[3]) const {
const MCInstrDesc &Desc = MI.getDesc();
@@ -5802,11 +6313,11 @@ unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
//
// If the operand's class is an SGPR, we can never move it.
- unsigned SGPRReg = findImplicitSGPRRead(MI);
+ Register SGPRReg = findImplicitSGPRRead(MI);
if (SGPRReg != AMDGPU::NoRegister)
return SGPRReg;
- unsigned UsedSGPRs[3] = { AMDGPU::NoRegister };
+ Register UsedSGPRs[3] = { AMDGPU::NoRegister };
const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
for (unsigned i = 0; i < 3; ++i) {
@@ -5919,10 +6430,9 @@ bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
return isSMRD(Opc);
}
-bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr &MI) const {
- unsigned Opc = MI.getOpcode();
-
- return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc);
+bool SIInstrInfo::isHighLatencyDef(int Opc) const {
+ return get(Opc).mayLoad() &&
+ (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
}
unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
@@ -6198,7 +6708,7 @@ MachineInstrBuilder
SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
const DebugLoc &DL,
- unsigned DestReg) const {
+ Register DestReg) const {
if (ST.hasAddNoCarry())
return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
@@ -6608,20 +7118,24 @@ MachineInstr *SIInstrInfo::foldMemoryOperandImpl(
// %0 may even spill. We can't spill $m0 normally (it would require copying to
// a numbered SGPR anyway), and since it is in the SReg_32 register class,
// TargetInstrInfo::foldMemoryOperand() is going to try.
+ // A similar issue also exists with spilling and reloading $exec registers.
//
// To prevent that, constrain the %0 register class here.
if (MI.isFullCopy()) {
Register DstReg = MI.getOperand(0).getReg();
Register SrcReg = MI.getOperand(1).getReg();
-
- if (DstReg == AMDGPU::M0 && SrcReg.isVirtual()) {
- MF.getRegInfo().constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
- return nullptr;
- }
-
- if (SrcReg == AMDGPU::M0 && DstReg.isVirtual()) {
- MF.getRegInfo().constrainRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass);
- return nullptr;
+ if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
+ (DstReg.isVirtual() != SrcReg.isVirtual())) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
+ const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
+ if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
+ MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
+ return nullptr;
+ } else if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
+ MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
+ return nullptr;
+ }
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index b151a94b0d118..53e2ffba0f656 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -84,6 +84,9 @@ private:
bool moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
MachineDominatorTree *MDT = nullptr) const;
+ void lowerSelect(SetVectorType &Worklist, MachineInstr &Inst,
+ MachineDominatorTree *MDT = nullptr) const;
+
void lowerScalarAbs(SetVectorType &Worklist,
MachineInstr &Inst) const;
@@ -119,7 +122,7 @@ private:
MachineRegisterInfo &MRI,
MachineInstr &Inst) const;
- void addUsersToMoveToVALUWorklist(unsigned Reg, MachineRegisterInfo &MRI,
+ void addUsersToMoveToVALUWorklist(Register Reg, MachineRegisterInfo &MRI,
SetVectorType &Worklist) const;
void addSCCDefUsersToVALUWorklist(MachineOperand &Op,
@@ -132,7 +135,7 @@ private:
bool checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
const MachineInstr &MIb) const;
- unsigned findUsedSGPR(const MachineInstr &MI, int OpIndices[3]) const;
+ Register findUsedSGPR(const MachineInstr &MI, int OpIndices[3]) const;
protected:
bool swapSourceModifiers(MachineInstr &MI,
@@ -181,14 +184,15 @@ public:
int64_t &Offset1,
int64_t &Offset2) const override;
- bool getMemOperandWithOffset(const MachineInstr &LdSt,
- const MachineOperand *&BaseOp,
- int64_t &Offset,
- const TargetRegisterInfo *TRI) const final;
+ bool getMemOperandsWithOffsetWidth(
+ const MachineInstr &LdSt,
+ SmallVectorImpl<const MachineOperand *> &BaseOps, int64_t &Offset,
+ bool &OffsetIsScalable, unsigned &Width,
+ const TargetRegisterInfo *TRI) const final;
- bool shouldClusterMemOps(const MachineOperand &BaseOp1,
- const MachineOperand &BaseOp2,
- unsigned NumLoads) const override;
+ bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
+ ArrayRef<const MachineOperand *> BaseOps2,
+ unsigned NumLoads, unsigned NumBytes) const override;
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0,
int64_t Offset1, unsigned NumLoads) const override;
@@ -210,22 +214,22 @@ public:
const TargetRegisterClass *getPreferredSelectRegClass(
unsigned Size) const;
- unsigned insertNE(MachineBasicBlock *MBB,
+ Register insertNE(MachineBasicBlock *MBB,
MachineBasicBlock::iterator I, const DebugLoc &DL,
- unsigned SrcReg, int Value) const;
+ Register SrcReg, int Value) const;
- unsigned insertEQ(MachineBasicBlock *MBB,
+ Register insertEQ(MachineBasicBlock *MBB,
MachineBasicBlock::iterator I, const DebugLoc &DL,
- unsigned SrcReg, int Value) const;
+ Register SrcReg, int Value) const;
void storeRegToStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI, unsigned SrcReg,
+ MachineBasicBlock::iterator MI, Register SrcReg,
bool isKill, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
void loadRegFromStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI, unsigned DestReg,
+ MachineBasicBlock::iterator MI, Register DestReg,
int FrameIndex, const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
@@ -244,6 +248,9 @@ public:
// DstRC, then AMDGPU::COPY is returned.
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const;
+ const MCInstrDesc &getIndirectRegWritePseudo(
+ unsigned VecSize, unsigned EltSize, bool IsSGPR) const;
+
LLVM_READONLY
int commuteOpcode(unsigned Opc) const;
@@ -293,20 +300,19 @@ public:
SmallVectorImpl<MachineOperand> &Cond) const override;
bool canInsertSelect(const MachineBasicBlock &MBB,
- ArrayRef<MachineOperand> Cond,
- unsigned TrueReg, unsigned FalseReg,
- int &CondCycles,
+ ArrayRef<MachineOperand> Cond, Register DstReg,
+ Register TrueReg, Register FalseReg, int &CondCycles,
int &TrueCycles, int &FalseCycles) const override;
void insertSelect(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I, const DebugLoc &DL,
- unsigned DstReg, ArrayRef<MachineOperand> Cond,
- unsigned TrueReg, unsigned FalseReg) const override;
+ Register DstReg, ArrayRef<MachineOperand> Cond,
+ Register TrueReg, Register FalseReg) const override;
void insertVectorSelect(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I, const DebugLoc &DL,
- unsigned DstReg, ArrayRef<MachineOperand> Cond,
- unsigned TrueReg, unsigned FalseReg) const;
+ Register DstReg, ArrayRef<MachineOperand> Cond,
+ Register TrueReg, Register FalseReg) const;
unsigned getAddressSpaceForPseudoSourceKind(
unsigned Kind) const override;
@@ -317,7 +323,7 @@ public:
bool isFoldableCopy(const MachineInstr &MI) const;
- bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned Reg,
+ bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg,
MachineRegisterInfo *MRI) const final;
unsigned getMachineCSELookAheadLimit() const override { return 500; }
@@ -685,6 +691,9 @@ public:
return MO.isReg() && RI.isVGPR(MRI, MO.getReg());});
}
+ /// Return true if the instruction modifies the mode register.
+ static bool modifiesModeRegister(const MachineInstr &MI);
+
/// Whether we must prevent this instruction from executing with EXEC = 0.
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const;
@@ -824,11 +833,7 @@ public:
const MachineOperand &MO = MI.getOperand(OpNo);
if (MO.isReg()) {
if (unsigned SubReg = MO.getSubReg()) {
- assert(RI.getRegSizeInBits(*RI.getSubClassWithSubReg(
- MI.getParent()->getParent()->getRegInfo().
- getRegClass(MO.getReg()), SubReg)) >= 32 &&
- "Sub-dword subregs are not supported");
- return RI.getSubRegIndexLaneMask(SubReg).getNumLanes() * 4;
+ return RI.getSubRegIdxSize(SubReg) / 8;
}
}
return RI.getRegSizeInBits(*getOpRegClass(MI, OpNo)) / 8;
@@ -874,7 +879,7 @@ public:
/// be used when it is known that the value in SrcReg is the same across all
/// threads in the wave.
/// \returns The SGPR register that \p SrcReg was copied to.
- unsigned readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
+ Register readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI,
MachineRegisterInfo &MRI) const;
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const;
@@ -928,7 +933,7 @@ public:
uint64_t getScratchRsrcWords23() const;
bool isLowLatencyInstruction(const MachineInstr &MI) const;
- bool isHighLatencyInstruction(const MachineInstr &MI) const;
+ bool isHighLatencyDef(int Opc) const override;
/// Return the descriptor of the target-specific machine instruction
/// that corresponds to the specified pseudo or native opcode.
@@ -995,7 +1000,7 @@ public:
MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
const DebugLoc &DL,
- unsigned DestReg) const;
+ Register DestReg) const;
MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 85e8d0582dcd1..7aee52f913605 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -7,11 +7,9 @@
//===----------------------------------------------------------------------===//
def isWave32 : Predicate<"Subtarget->getWavefrontSize() == 32">,
- AssemblerPredicate <"FeatureWavefrontSize32">;
+ AssemblerPredicate <(all_of FeatureWavefrontSize32)>;
def isWave64 : Predicate<"Subtarget->getWavefrontSize() == 64">,
- AssemblerPredicate <"FeatureWavefrontSize64">;
-
-def DisableInst : Predicate <"false">, AssemblerPredicate<"FeatureDisable">;
+ AssemblerPredicate <(all_of FeatureWavefrontSize64)>;
class GCNPredicateControl : PredicateControl {
Predicate SIAssemblerPredicate = isGFX6GFX7;
@@ -30,6 +28,7 @@ def SIEncodingFamily {
int GFX9 = 5;
int GFX10 = 6;
int SDWA10 = 7;
+ int GFX10_B = 8;
}
//===----------------------------------------------------------------------===//
@@ -39,8 +38,7 @@ def SIEncodingFamily {
def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPUnaryOp>;
def SIsbuffer_load : SDNode<"AMDGPUISD::SBUFFER_LOAD",
- SDTypeProfile<1, 4, [SDTCisVT<1, v4i32>, SDTCisVT<2, i32>, SDTCisVT<3, i1>,
- SDTCisVT<4, i1>]>,
+ SDTypeProfile<1, 3, [SDTCisVT<1, v4i32>, SDTCisVT<2, i32>, SDTCisVT<3, i32>]>,
[SDNPMayLoad, SDNPMemOperand]
>;
@@ -57,6 +55,10 @@ def SIatomic_dec : SDNode<"AMDGPUISD::ATOMIC_DEC", SDTAtomic2,
[SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
>;
+def SIatomic_csub : SDNode<"AMDGPUISD::ATOMIC_LOAD_CSUB", SDTAtomic2,
+ [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
+>;
+
def SDTAtomic2_f32 : SDTypeProfile<1, 2, [
SDTCisSameAs<0,2>, SDTCisFP<0>, SDTCisPtrTy<1>
]>;
@@ -200,6 +202,7 @@ def SIbuffer_atomic_or : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_OR">;
def SIbuffer_atomic_xor : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_XOR">;
def SIbuffer_atomic_inc : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_INC">;
def SIbuffer_atomic_dec : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_DEC">;
+def SIbuffer_atomic_csub : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_CSUB">;
def SIbuffer_atomic_fadd : SDBufferAtomicNoRtn <"AMDGPUISD::BUFFER_ATOMIC_FADD", f32>;
def SIbuffer_atomic_pk_fadd : SDBufferAtomicNoRtn <"AMDGPUISD::BUFFER_ATOMIC_PK_FADD", v2f16>;
@@ -267,7 +270,7 @@ def SIload_d16_hi_i8 : SDNode<"AMDGPUISD::LOAD_D16_HI_I8",
def SIdenorm_mode : SDNode<"AMDGPUISD::DENORM_MODE",
SDTypeProfile<0 ,1, [SDTCisInt<0>]>,
- [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]
>;
//===----------------------------------------------------------------------===//
@@ -308,6 +311,10 @@ class isPackedType<ValueType SrcVT> {
// PatFrags for global memory operations
//===----------------------------------------------------------------------===//
+let AddressSpaces = !cast<AddressSpaceList>("LoadAddress_global").AddrSpaces in {
+defm atomic_csub_global : binary_atomic_op<SIatomic_csub>;
+}
+
foreach as = [ "global", "flat", "constant", "local", "private", "region" ] in {
let AddressSpaces = !cast<AddressSpaceList>("LoadAddress_"#as).AddrSpaces in {
@@ -631,6 +638,16 @@ def add_ctpop : PatFrag <
(add (ctpop $src0), $src1)
>;
+foreach I = 1-4 in {
+def shl#I#_add : PatFrag <
+ (ops node:$src0, node:$src1),
+ (add (shl_oneuse $src0, (i32 I)), $src1)> {
+ // FIXME: Poor substitute for disabling pattern in SelectionDAG
+ let PredicateCode = [{return false;}];
+ let GISelPredicateCode = [{return true;}];
+}
+}
+
multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0,
SDTypeProfile tc = SDTAtomic2,
bit IsInt = 1> {
@@ -651,6 +668,7 @@ multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0,
defm atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">;
defm atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">;
+defm atomic_load_csub : SIAtomicM0Glue2 <"LOAD_CSUB", 1>;
defm atomic_inc : SIAtomicM0Glue2 <"INC", 1>;
defm atomic_dec : SIAtomicM0Glue2 <"DEC", 1>;
defm atomic_load_and : SIAtomicM0Glue2 <"LOAD_AND">;
@@ -665,7 +683,7 @@ defm atomic_load_fadd : SIAtomicM0Glue2 <"LOAD_FADD", 0, SDTAtomic2_f32, 0>;
defm atomic_load_fmin : SIAtomicM0Glue2 <"LOAD_FMIN", 1, SDTAtomic2_f32, 0>;
defm atomic_load_fmax : SIAtomicM0Glue2 <"LOAD_FMAX", 1, SDTAtomic2_f32, 0>;
-def as_i1imm : SDNodeXForm<imm, [{
+def as_i1timm : SDNodeXForm<timm, [{
return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i1);
}]>;
@@ -673,6 +691,10 @@ def as_i8imm : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i8);
}]>;
+def as_i8timm : SDNodeXForm<timm, [{
+ return CurDAG->getTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i16);
+}]>;
+
def as_i16imm : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i16);
}]>;
@@ -766,7 +788,7 @@ def NegSubInlineConst32 : ImmLeaf<i32, [{
return Imm < -16 && Imm >= -64;
}], NegateImm>;
-def NegSubInlineConst16 : ImmLeaf<i16, [{
+def NegSubInlineIntConst16 : ImmLeaf<i16, [{
return Imm < -16 && Imm >= -64;
}], NegateImm>;
@@ -791,6 +813,26 @@ def NegSubInlineConstV216 : PatLeaf<(build_vector), [{
}], getNegV2I16Imm>;
//===----------------------------------------------------------------------===//
+// MUBUF/SMEM Patterns
+//===----------------------------------------------------------------------===//
+
+def extract_glc : SDNodeXForm<timm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() & 1, SDLoc(N), MVT::i8);
+}]>;
+
+def extract_slc : SDNodeXForm<timm, [{
+ return CurDAG->getTargetConstant((N->getZExtValue() >> 1) & 1, SDLoc(N), MVT::i8);
+}]>;
+
+def extract_dlc : SDNodeXForm<timm, [{
+ return CurDAG->getTargetConstant((N->getZExtValue() >> 2) & 1, SDLoc(N), MVT::i8);
+}]>;
+
+def extract_swz : SDNodeXForm<timm, [{
+ return CurDAG->getTargetConstant((N->getZExtValue() >> 3) & 1, SDLoc(N), MVT::i8);
+}]>;
+
+//===----------------------------------------------------------------------===//
// Custom Operands
//===----------------------------------------------------------------------===//
@@ -935,7 +977,7 @@ def VOPDstS64orS32 : BoolRC {
}
// SCSrc_i1 is the operand for pseudo instructions only.
-// Boolean immeadiates shall not be exposed to codegen instructions.
+// Boolean immediates shall not be exposed to codegen instructions.
def SCSrc_i1 : RegisterOperand<SReg_1_XEXEC> {
let OperandNamespace = "AMDGPU";
let OperandType = "OPERAND_REG_IMM_INT32";
@@ -1067,6 +1109,7 @@ def SWZ : NamedOperandBit<"SWZ", NamedMatchClass<"SWZ">>;
def UNorm : NamedOperandBit<"UNorm", NamedMatchClass<"UNorm">>;
def DA : NamedOperandBit<"DA", NamedMatchClass<"DA">>;
def R128A16 : NamedOperandBit<"R128A16", NamedMatchClass<"R128A16">>;
+def GFX10A16 : NamedOperandBit<"GFX10A16", NamedMatchClass<"GFX10A16">>;
def D16 : NamedOperandBit<"D16", NamedMatchClass<"D16">>;
def LWE : NamedOperandBit<"LWE", NamedMatchClass<"LWE">>;
def exp_compr : NamedOperandBit<"ExpCompr", NamedMatchClass<"ExpCompr">>;
@@ -1099,9 +1142,9 @@ def blgp : NamedOperandU32<"BLGP", NamedMatchClass<"BLGP">>;
def cbsz : NamedOperandU32<"CBSZ", NamedMatchClass<"CBSZ">>;
def abid : NamedOperandU32<"ABID", NamedMatchClass<"ABID">>;
-def hwreg : NamedOperandU16<"Hwreg", NamedMatchClass<"Hwreg", 0>>;
+def hwreg : NamedOperandU32<"Hwreg", NamedMatchClass<"Hwreg", 0>>;
-def exp_tgt : NamedOperandU8<"ExpTgt", NamedMatchClass<"ExpTgt", 0>> {
+def exp_tgt : NamedOperandU32<"ExpTgt", NamedMatchClass<"ExpTgt", 0>> {
}
@@ -1274,19 +1317,14 @@ def VOP3Mods : ComplexPattern<untyped, 2, "SelectVOP3Mods">;
def VOP3NoMods : ComplexPattern<untyped, 1, "SelectVOP3NoMods">;
// VOP3Mods, but the input source is known to never be NaN.
def VOP3Mods_nnan : ComplexPattern<fAny, 2, "SelectVOP3Mods_NNaN">;
-// VOP3Mods, but only allowed for f32 operands.
-def VOP3Mods_f32 : ComplexPattern<fAny, 2, "SelectVOP3Mods_f32">;
def VOP3OMods : ComplexPattern<untyped, 3, "SelectVOP3OMods">;
def VOP3PMods : ComplexPattern<untyped, 2, "SelectVOP3PMods">;
-def VOP3PMods0 : ComplexPattern<untyped, 3, "SelectVOP3PMods0">;
def VOP3OpSel : ComplexPattern<untyped, 2, "SelectVOP3OpSel">;
-def VOP3OpSel0 : ComplexPattern<untyped, 3, "SelectVOP3OpSel0">;
def VOP3OpSelMods : ComplexPattern<untyped, 2, "SelectVOP3OpSelMods">;
-def VOP3OpSelMods0 : ComplexPattern<untyped, 3, "SelectVOP3OpSelMods0">;
def VOP3PMadMixMods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixMods">;
@@ -1347,6 +1385,7 @@ def HWREG {
int FLAT_SCR_HI = 21;
int XNACK_MASK = 22;
int POPS_PACKER = 25;
+ int SHADER_CYCLES = 29;
}
class getHwRegImm<int Reg, int Offset = 0, int Size = 32> {
@@ -1380,24 +1419,21 @@ class SIMCInstr <string pseudo, int subtarget> {
// EXP classes
//===----------------------------------------------------------------------===//
-class EXP_Helper<bit done, SDPatternOperator node = null_frag> : EXPCommon<
+class EXP_Helper<bit done> : EXPCommon<
(outs),
(ins exp_tgt:$tgt,
ExpSrc0:$src0, ExpSrc1:$src1, ExpSrc2:$src2, ExpSrc3:$src3,
- exp_vm:$vm, exp_compr:$compr, i8imm:$en),
- "exp$tgt $src0, $src1, $src2, $src3"#!if(done, " done", "")#"$compr$vm",
- [(node (i8 timm:$tgt), (i8 timm:$en),
- f32:$src0, f32:$src1, f32:$src2, f32:$src3,
- (i1 timm:$compr), (i1 timm:$vm))]> {
+ exp_vm:$vm, exp_compr:$compr, i32imm:$en),
+ "exp$tgt $src0, $src1, $src2, $src3"#!if(done, " done", "")#"$compr$vm", []> {
let AsmMatchConverter = "cvtExp";
}
// Split EXP instruction into EXP and EXP_DONE so we can set
// mayLoad for done=1.
-multiclass EXP_m<bit done, SDPatternOperator node> {
+multiclass EXP_m<bit done> {
let mayLoad = done, DisableWQM = 1 in {
let isPseudo = 1, isCodeGenOnly = 1 in {
- def "" : EXP_Helper<done, node>,
+ def "" : EXP_Helper<done>,
SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.NONE>;
}
@@ -1685,7 +1721,7 @@ class getInsVOP3P <RegisterOperand Src0RC, RegisterOperand Src1RC,
!if (HasClamp,
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
Src1Mod:$src1_modifiers, Src1RC:$src1,
- clampmod:$clamp,
+ clampmod0:$clamp,
op_sel:$op_sel, op_sel_hi:$op_sel_hi,
neg_lo:$neg_lo, neg_hi:$neg_hi),
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
@@ -1697,7 +1733,7 @@ class getInsVOP3P <RegisterOperand Src0RC, RegisterOperand Src1RC,
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
Src1Mod:$src1_modifiers, Src1RC:$src1,
Src2Mod:$src2_modifiers, Src2RC:$src2,
- clampmod:$clamp,
+ clampmod0:$clamp,
op_sel:$op_sel, op_sel_hi:$op_sel_hi,
neg_lo:$neg_lo, neg_hi:$neg_hi),
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
@@ -1720,7 +1756,7 @@ class getInsVOP3OpSel <RegisterOperand Src0RC,
!if (HasClamp,
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
Src1Mod:$src1_modifiers, Src1RC:$src1,
- clampmod:$clamp,
+ clampmod0:$clamp,
op_sel:$op_sel),
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
Src1Mod:$src1_modifiers, Src1RC:$src1,
@@ -1730,7 +1766,7 @@ class getInsVOP3OpSel <RegisterOperand Src0RC,
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
Src1Mod:$src1_modifiers, Src1RC:$src1,
Src2Mod:$src2_modifiers, Src2RC:$src2,
- clampmod:$clamp,
+ clampmod0:$clamp,
op_sel:$op_sel),
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
Src1Mod:$src1_modifiers, Src1RC:$src1,
@@ -2242,6 +2278,7 @@ def VOP_F16_F16_F16 : VOPProfile <[f16, f16, f16, untyped]>;
def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>;
def VOP_F16_F16_I32 : VOPProfile <[f16, f16, i32, untyped]>;
def VOP_I16_I16_I16 : VOPProfile <[i16, i16, i16, untyped]>;
+def VOP_I16_I16_I16_ARITH : VOPProfile <[i16, i16, i16, untyped], 0, /*EnableClamp=*/1>;
def VOP_I16_I16_I16_I16 : VOPProfile <[i16, i16, i16, i16, untyped]>;
def VOP_F16_F16_F16_F16 : VOPProfile <[f16, f16, f16, f16, untyped]>;
@@ -2455,7 +2492,8 @@ def getMCOpcodeGen : InstrMapping {
[!cast<string>(SIEncodingFamily.GFX80)],
[!cast<string>(SIEncodingFamily.GFX9)],
[!cast<string>(SIEncodingFamily.GFX10)],
- [!cast<string>(SIEncodingFamily.SDWA10)]];
+ [!cast<string>(SIEncodingFamily.SDWA10)],
+ [!cast<string>(SIEncodingFamily.GFX10_B)]];
}
// Get equivalent SOPK instruction.
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index d84720f820ee3..0c4c9e0e9df2b 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1,4 +1,4 @@
-//===-- SIInstructions.td - SI Instruction Defintions ---------------------===//
+//===-- SIInstructions.td - SI Instruction Definitions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -24,8 +24,38 @@ include "BUFInstructions.td"
// EXP Instructions
//===----------------------------------------------------------------------===//
-defm EXP : EXP_m<0, AMDGPUexport>;
-defm EXP_DONE : EXP_m<1, AMDGPUexport_done>;
+defm EXP : EXP_m<0>;
+defm EXP_DONE : EXP_m<1>;
+
+class ExpPattern<ValueType vt, Instruction Inst, int done_val> : GCNPat<
+ (int_amdgcn_exp timm:$tgt, timm:$en,
+ (vt ExpSrc0:$src0), (vt ExpSrc1:$src1),
+ (vt ExpSrc2:$src2), (vt ExpSrc3:$src3),
+ done_val, timm:$vm),
+ (Inst timm:$tgt, ExpSrc0:$src0, ExpSrc1:$src1,
+ ExpSrc2:$src2, ExpSrc3:$src3, timm:$vm, 0, timm:$en)
+>;
+
+class ExpComprPattern<ValueType vt, Instruction Inst, int done_val> : GCNPat<
+ (int_amdgcn_exp_compr timm:$tgt, timm:$en,
+ (vt ExpSrc0:$src0), (vt ExpSrc1:$src1),
+ done_val, timm:$vm),
+ (Inst timm:$tgt, ExpSrc0:$src0, ExpSrc1:$src1,
+ (IMPLICIT_DEF), (IMPLICIT_DEF), timm:$vm, 1, timm:$en)
+>;
+
+// FIXME: The generated DAG matcher seems to have strange behavior
+// with a 1-bit literal to match, so use a -1 for checking a true
+// 1-bit value.
+def : ExpPattern<i32, EXP, 0>;
+def : ExpPattern<i32, EXP_DONE, -1>;
+def : ExpPattern<f32, EXP, 0>;
+def : ExpPattern<f32, EXP_DONE, -1>;
+
+def : ExpComprPattern<v2i16, EXP, 0>;
+def : ExpComprPattern<v2i16, EXP_DONE, -1>;
+def : ExpComprPattern<v2f16, EXP, 0>;
+def : ExpComprPattern<v2f16, EXP_DONE, -1>;
//===----------------------------------------------------------------------===//
// VINTRP Instructions
@@ -34,9 +64,9 @@ defm EXP_DONE : EXP_m<1, AMDGPUexport_done>;
// Used to inject printing of "_e32" suffix for VI (there are "_e64" variants for VI)
def VINTRPDst : VINTRPDstOperand <VGPR_32>;
-let Uses = [M0, EXEC] in {
+let Uses = [MODE, M0, EXEC] in {
-// FIXME: Specify SchedRW for VINTRP insturctions.
+// FIXME: Specify SchedRW for VINTRP instructions.
multiclass V_INTERP_P1_F32_m : VINTRP_m <
0x00000000,
@@ -76,10 +106,10 @@ defm V_INTERP_MOV_F32 : VINTRP_m <
(outs VINTRPDst:$vdst),
(ins InterpSlot:$vsrc, Attr:$attr, AttrChan:$attrchan),
"v_interp_mov_f32$vdst, $vsrc, $attr$attrchan",
- [(set f32:$vdst, (int_amdgcn_interp_mov (i32 imm:$vsrc),
+ [(set f32:$vdst, (int_amdgcn_interp_mov (i32 timm:$vsrc),
(i32 timm:$attrchan), (i32 timm:$attr), M0))]>;
-} // End Uses = [M0, EXEC]
+} // End Uses = [MODE, M0, EXEC]
//===----------------------------------------------------------------------===//
// Pseudo Instructions
@@ -136,7 +166,8 @@ def WWM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
} // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]
def ENTER_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> {
- let Defs = [EXEC];
+ let Uses = [EXEC];
+ let Defs = [EXEC, SCC];
let hasSideEffects = 0;
let mayLoad = 0;
let mayStore = 0;
@@ -162,16 +193,27 @@ def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst),
let Constraints = "$src = $vdst";
}
+let usesCustomInserter = 1, Defs = [VCC, EXEC] in {
+def V_ADD_U64_PSEUDO : VPseudoInstSI <
+ (outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1),
+ [(set VReg_64:$vdst, (getDivergentFrag<add>.ret i64:$src0, i64:$src1))]
+>;
+
+def V_SUB_U64_PSEUDO : VPseudoInstSI <
+ (outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1),
+ [(set VReg_64:$vdst, (getDivergentFrag<sub>.ret i64:$src0, i64:$src1))]
+>;
+} // End usesCustomInserter = 1, Defs = [VCC, EXEC]
let usesCustomInserter = 1, Defs = [SCC] in {
def S_ADD_U64_PSEUDO : SPseudoInstSI <
- (outs SReg_64:$vdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
- [(set SReg_64:$vdst, (add i64:$src0, i64:$src1))]
+ (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
+ [(set SReg_64:$sdst, (UniformBinFrag<add> i64:$src0, i64:$src1))]
>;
def S_SUB_U64_PSEUDO : SPseudoInstSI <
- (outs SReg_64:$vdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
- [(set SReg_64:$vdst, (sub i64:$src0, i64:$src1))]
+ (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
+ [(set SReg_64:$sdst, (UniformBinFrag<sub> i64:$src0, i64:$src1))]
>;
def S_ADD_U64_CO_PSEUDO : SPseudoInstSI <
@@ -181,6 +223,23 @@ def S_ADD_U64_CO_PSEUDO : SPseudoInstSI <
def S_SUB_U64_CO_PSEUDO : SPseudoInstSI <
(outs SReg_64:$vdst, VOPDstS64orS32:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1)
>;
+
+def S_ADD_CO_PSEUDO : SPseudoInstSI <
+ (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1, SSrc_i1:$scc_in)
+>;
+
+def S_SUB_CO_PSEUDO : SPseudoInstSI <
+ (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1, SSrc_i1:$scc_in)
+>;
+
+def S_UADDO_PSEUDO : SPseudoInstSI <
+ (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1)
+>;
+
+def S_USUBO_PSEUDO : SPseudoInstSI <
+ (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1)
+>;
+
} // End usesCustomInserter = 1, Defs = [SCC]
let usesCustomInserter = 1 in {
@@ -199,6 +258,7 @@ class WrapTerminatorInst<SOP_Pseudo base_inst> : SPseudoInstSI<
let hasSideEffects = base_inst.hasSideEffects;
let UseNamedOperandTable = base_inst.UseNamedOperandTable;
let CodeSize = base_inst.CodeSize;
+ let SchedRW = base_inst.SchedRW;
}
let WaveSizePredicate = isWave64 in {
@@ -214,13 +274,14 @@ def S_OR_B32_term : WrapTerminatorInst<S_OR_B32>;
def S_ANDN2_B32_term : WrapTerminatorInst<S_ANDN2_B32>;
}
+
def WAVE_BARRIER : SPseudoInstSI<(outs), (ins),
[(int_amdgcn_wave_barrier)]> {
let SchedRW = [];
let hasNoSchedulingInfo = 1;
let hasSideEffects = 1;
- let mayLoad = 1;
- let mayStore = 1;
+ let mayLoad = 0;
+ let mayStore = 0;
let isConvergent = 1;
let FixedSize = 1;
let Size = 0;
@@ -318,6 +379,9 @@ multiclass PseudoInstKill <dag ins> {
defm SI_KILL_I1 : PseudoInstKill <(ins SCSrc_i1:$src, i1imm:$killvalue)>;
defm SI_KILL_F32_COND_IMM : PseudoInstKill <(ins VSrc_b32:$src0, i32imm:$src1, i32imm:$cond)>;
+let Defs = [EXEC] in
+def SI_KILL_CLEANUP : SPseudoInstSI <(outs), (ins)>;
+
let Defs = [EXEC,VCC] in
def SI_ILLEGAL_COPY : SPseudoInstSI <
(outs unknown:$dst), (ins unknown:$src),
@@ -386,7 +450,7 @@ def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI <
def : GCNPat <
(int_amdgcn_init_exec timm:$src),
- (SI_INIT_EXEC_LO (as_i32imm imm:$src))> {
+ (SI_INIT_EXEC_LO (as_i32timm timm:$src))> {
let WaveSizePredicate = isWave32;
}
@@ -413,8 +477,8 @@ def SI_RETURN : SPseudoInstSI <
// Return for returning function calls without output register.
//
-// This version is only needed so we can fill in the output regiter in
-// the custom inserter.
+// This version is only needed so we can fill in the output register
+// in the custom inserter.
def SI_CALL_ISEL : SPseudoInstSI <
(outs), (ins SSrc_b64:$src0, unknown:$callee),
[(AMDGPUcall i64:$src0, tglobaladdr:$callee)]> {
@@ -426,6 +490,11 @@ def SI_CALL_ISEL : SPseudoInstSI <
let isConvergent = 1;
}
+def : GCNPat<
+ (AMDGPUcall i64:$src0, (i64 0)),
+ (SI_CALL_ISEL $src0, (i64 0))
+>;
+
// Wrapper around s_swappc_b64 with extra $callee parameter to track
// the called function after regalloc.
def SI_CALL : SPseudoInstSI <
@@ -480,6 +549,8 @@ def ADJCALLSTACKDOWN : SPseudoInstSI<
let Defs = [M0, EXEC, SCC],
UseNamedOperandTable = 1 in {
+// SI_INDIRECT_SRC/DST are only used by the legacy SelectionDAG
+// indirect addressing implementation.
class SI_INDIRECT_SRC<RegisterClass rc> : VPseudoInstSI <
(outs VGPR_32:$vdst),
(ins rc:$src, VS_32:$idx, i32imm:$offset)> {
@@ -493,21 +564,81 @@ class SI_INDIRECT_DST<RegisterClass rc> : VPseudoInstSI <
let usesCustomInserter = 1;
}
-// TODO: We can support indirect SGPR access.
def SI_INDIRECT_SRC_V1 : SI_INDIRECT_SRC<VGPR_32>;
def SI_INDIRECT_SRC_V2 : SI_INDIRECT_SRC<VReg_64>;
def SI_INDIRECT_SRC_V4 : SI_INDIRECT_SRC<VReg_128>;
def SI_INDIRECT_SRC_V8 : SI_INDIRECT_SRC<VReg_256>;
def SI_INDIRECT_SRC_V16 : SI_INDIRECT_SRC<VReg_512>;
+def SI_INDIRECT_SRC_V32 : SI_INDIRECT_SRC<VReg_1024>;
def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VGPR_32>;
def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>;
def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>;
def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>;
def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>;
+def SI_INDIRECT_DST_V32 : SI_INDIRECT_DST<VReg_1024>;
} // End Uses = [EXEC], Defs = [M0, EXEC]
+
+// This is a pseudo variant of the v_movreld_b32 (or v_mov_b32
+// expecting to be executed with gpr indexing mode enabled)
+// instruction in which the vector operand appears only twice, once as
+// def and once as use. Using this pseudo avoids problems with the Two
+// Address instructions pass.
+class INDIRECT_REG_WRITE_pseudo<RegisterClass rc,
+ RegisterOperand val_ty> : PseudoInstSI <
+ (outs rc:$vdst), (ins rc:$vsrc, val_ty:$val, i32imm:$subreg)> {
+ let Constraints = "$vsrc = $vdst";
+ let Uses = [M0];
+}
+
+class V_INDIRECT_REG_WRITE_B32_pseudo<RegisterClass rc> :
+ INDIRECT_REG_WRITE_pseudo<rc, VSrc_b32> {
+ let VALU = 1;
+ let VOP1 = 1;
+ let Uses = [M0, EXEC];
+}
+
+class S_INDIRECT_REG_WRITE_pseudo<RegisterClass rc,
+ RegisterOperand val_ty> :
+ INDIRECT_REG_WRITE_pseudo<rc, val_ty> {
+ let SALU = 1;
+ let SOP1 = 1;
+ let Uses = [M0];
+}
+
+class S_INDIRECT_REG_WRITE_B32_pseudo<RegisterClass rc> :
+ S_INDIRECT_REG_WRITE_pseudo<rc, SSrc_b32>;
+class S_INDIRECT_REG_WRITE_B64_pseudo<RegisterClass rc> :
+ S_INDIRECT_REG_WRITE_pseudo<rc, SSrc_b64>;
+
+
+def V_INDIRECT_REG_WRITE_B32_V1 : V_INDIRECT_REG_WRITE_B32_pseudo<VGPR_32>;
+def V_INDIRECT_REG_WRITE_B32_V2 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_64>;
+def V_INDIRECT_REG_WRITE_B32_V3 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_96>;
+def V_INDIRECT_REG_WRITE_B32_V4 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_128>;
+def V_INDIRECT_REG_WRITE_B32_V5 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_160>;
+def V_INDIRECT_REG_WRITE_B32_V8 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_256>;
+def V_INDIRECT_REG_WRITE_B32_V16 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_512>;
+def V_INDIRECT_REG_WRITE_B32_V32 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_1024>;
+
+def S_INDIRECT_REG_WRITE_B32_V1 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_32>;
+def S_INDIRECT_REG_WRITE_B32_V2 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_64>;
+def S_INDIRECT_REG_WRITE_B32_V3 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_96>;
+def S_INDIRECT_REG_WRITE_B32_V4 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_128>;
+def S_INDIRECT_REG_WRITE_B32_V5 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_160>;
+def S_INDIRECT_REG_WRITE_B32_V8 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_256>;
+def S_INDIRECT_REG_WRITE_B32_V16 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_512>;
+def S_INDIRECT_REG_WRITE_B32_V32 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_1024>;
+
+def S_INDIRECT_REG_WRITE_B64_V1 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_64>;
+def S_INDIRECT_REG_WRITE_B64_V2 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_128>;
+def S_INDIRECT_REG_WRITE_B64_V4 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_256>;
+def S_INDIRECT_REG_WRITE_B64_V8 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_512>;
+def S_INDIRECT_REG_WRITE_B64_V16 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_1024>;
+
+
multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> {
let UseNamedOperandTable = 1, SGPRSpill = 1, Uses = [EXEC] in {
def _SAVE : PseudoInstSI <
@@ -535,6 +666,7 @@ defm SI_SPILL_S64 : SI_SPILL_SGPR <SReg_64>;
defm SI_SPILL_S96 : SI_SPILL_SGPR <SReg_96>;
defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>;
defm SI_SPILL_S160 : SI_SPILL_SGPR <SReg_160>;
+defm SI_SPILL_S192 : SI_SPILL_SGPR <SReg_192>;
defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
defm SI_SPILL_S1024 : SI_SPILL_SGPR <SReg_1024>;
@@ -574,6 +706,7 @@ defm SI_SPILL_V64 : SI_SPILL_VGPR <VReg_64>;
defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96>;
defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>;
defm SI_SPILL_V160 : SI_SPILL_VGPR <VReg_160>;
+defm SI_SPILL_V192 : SI_SPILL_VGPR <VReg_192>;
defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>;
defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>;
defm SI_SPILL_V1024 : SI_SPILL_VGPR <VReg_1024>;
@@ -639,12 +772,6 @@ def : GCNPat<
>;
def : Pat <
- // -1.0 as i32 (LowerINTRINSIC_VOID converts all other constants to -1.0)
- (AMDGPUkill (i32 -1082130432)),
- (SI_KILL_I1_PSEUDO (i1 0), 0)
->;
-
-def : Pat <
(int_amdgcn_kill i1:$src),
(SI_KILL_I1_PSEUDO SCSrc_i1:$src, 0)
>;
@@ -655,11 +782,6 @@ def : Pat <
>;
def : Pat <
- (AMDGPUkill i32:$src),
- (SI_KILL_F32_COND_IMM_PSEUDO VSrc_b32:$src, 0, 3) // 3 means SETOGE
->;
-
-def : Pat <
(int_amdgcn_kill (i1 (setcc f32:$src, InlineImmFP32:$imm, cond:$cond))),
(SI_KILL_F32_COND_IMM_PSEUDO VSrc_b32:$src, (bitcast_fpimm_to_i32 $imm), (cond_as_i32imm $cond))
>;
@@ -693,14 +815,14 @@ def : RsqPat<V_RSQ_F64_e32, f64>;
def : GCNPat <
(f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)),
(f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))),
- (V_FRACT_F32_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE)
+ (V_FRACT_F32_e64 $mods, $x)
>;
// Convert (x + (-floor(x))) to fract(x)
def : GCNPat <
(f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)),
(f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))),
- (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE)
+ (V_FRACT_F64_e64 $mods, $x)
>;
} // End OtherPredicates = [UnsafeFPMath]
@@ -709,27 +831,27 @@ def : GCNPat <
// f16_to_fp patterns
def : GCNPat <
(f32 (f16_to_fp i32:$src0)),
- (V_CVT_F32_F16_e64 SRCMODS.NONE, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
+ (V_CVT_F32_F16_e64 SRCMODS.NONE, $src0)
>;
def : GCNPat <
(f32 (f16_to_fp (and_oneuse i32:$src0, 0x7fff))),
- (V_CVT_F32_F16_e64 SRCMODS.ABS, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
+ (V_CVT_F32_F16_e64 SRCMODS.ABS, $src0)
>;
def : GCNPat <
(f32 (f16_to_fp (i32 (srl_oneuse (and_oneuse i32:$src0, 0x7fff0000), (i32 16))))),
- (V_CVT_F32_F16_e64 SRCMODS.ABS, (i32 (V_LSHRREV_B32_e64 (i32 16), i32:$src0)), DSTCLAMP.NONE, DSTOMOD.NONE)
+ (V_CVT_F32_F16_e64 SRCMODS.ABS, (i32 (V_LSHRREV_B32_e64 (i32 16), i32:$src0)))
>;
def : GCNPat <
(f32 (f16_to_fp (or_oneuse i32:$src0, 0x8000))),
- (V_CVT_F32_F16_e64 SRCMODS.NEG_ABS, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
+ (V_CVT_F32_F16_e64 SRCMODS.NEG_ABS, $src0)
>;
def : GCNPat <
(f32 (f16_to_fp (xor_oneuse i32:$src0, 0x8000))),
- (V_CVT_F32_F16_e64 SRCMODS.NEG, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
+ (V_CVT_F32_F16_e64 SRCMODS.NEG, $src0)
>;
def : GCNPat <
@@ -740,7 +862,7 @@ def : GCNPat <
// fp_to_fp16 patterns
def : GCNPat <
(i32 (AMDGPUfp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
- (V_CVT_F16_F32_e64 $src0_modifiers, f32:$src0, DSTCLAMP.NONE, DSTOMOD.NONE)
+ (V_CVT_F16_F32_e64 $src0_modifiers, f32:$src0)
>;
def : GCNPat <
@@ -767,20 +889,29 @@ def : GCNPat <
// VOP2 Patterns
//===----------------------------------------------------------------------===//
-multiclass FMADPat <ValueType vt, Instruction inst> {
- def : GCNPat <
- (vt (fmad (VOP3NoMods vt:$src0),
- (VOP3NoMods vt:$src1),
- (VOP3NoMods vt:$src2))),
+// TODO: Check only no src2 mods?
+class FMADPat <ValueType vt, Instruction inst, SDPatternOperator node>
+ : GCNPat <(vt (node (vt (VOP3NoMods vt:$src0)),
+ (vt (VOP3NoMods vt:$src1)),
+ (vt (VOP3NoMods vt:$src2)))),
(inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
- >;
+>;
+
+
+// Prefer mac form when there are no modifiers.
+let AddedComplexity = 9 in {
+def : FMADPat <f32, V_MAC_F32_e64, fmad>;
+def : FMADPat <f32, V_MAC_F32_e64, AMDGPUfmad_ftz>;
+
+let SubtargetPredicate = Has16BitInsts in {
+def : FMADPat <f16, V_MAC_F16_e64, fmad>;
+def : FMADPat <f16, V_MAC_F16_e64, AMDGPUfmad_ftz>;
}
-defm : FMADPat <f16, V_MAC_F16_e64>;
-defm : FMADPat <f32, V_MAC_F32_e64>;
+}
-class FMADModsPat<Instruction inst, SDPatternOperator mad_opr, ValueType Ty>
+class FMADModsPat<ValueType Ty, Instruction inst, SDPatternOperator mad_opr>
: GCNPat<
(Ty (mad_opr (Ty (VOP3Mods Ty:$src0, i32:$src0_mod)),
(Ty (VOP3Mods Ty:$src1, i32:$src1_mod)),
@@ -789,24 +920,28 @@ class FMADModsPat<Instruction inst, SDPatternOperator mad_opr, ValueType Ty>
$src2_mod, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
>;
-// FIXME: This should select to V_MAC_F32
-def : FMADModsPat<V_MAD_F32, AMDGPUfmad_ftz, f32>;
-def : FMADModsPat<V_MAD_F16, AMDGPUfmad_ftz, f16> {
+let SubtargetPredicate = HasMadMacF32Insts in
+def : FMADModsPat<f32, V_MAD_F32, AMDGPUfmad_ftz>;
+def : FMADModsPat<f16, V_MAD_F16, AMDGPUfmad_ftz> {
let SubtargetPredicate = Has16BitInsts;
}
-multiclass SelectPat <ValueType vt> {
- def : GCNPat <
- (vt (select i1:$src0, (VOP3Mods_f32 vt:$src1, i32:$src1_mods),
- (VOP3Mods_f32 vt:$src2, i32:$src2_mods))),
- (V_CNDMASK_B32_e64 $src2_mods, $src2, $src1_mods, $src1, $src0)
- >;
-}
+class VOPSelectModsPat <ValueType vt> : GCNPat <
+ (vt (select i1:$src0, (VOP3Mods vt:$src1, i32:$src1_mods),
+ (VOP3Mods vt:$src2, i32:$src2_mods))),
+ (V_CNDMASK_B32_e64 FP32InputMods:$src2_mods, VSrc_b32:$src2,
+ FP32InputMods:$src1_mods, VSrc_b32:$src1, SSrc_i1:$src0)
+>;
+
+class VOPSelectPat <ValueType vt> : GCNPat <
+ (vt (select i1:$src0, vt:$src1, vt:$src2)),
+ (V_CNDMASK_B32_e64 0, VSrc_b32:$src2, 0, VSrc_b32:$src1, SSrc_i1:$src0)
+>;
-defm : SelectPat <i16>;
-defm : SelectPat <i32>;
-defm : SelectPat <f16>;
-defm : SelectPat <f32>;
+def : VOPSelectModsPat <i32>;
+def : VOPSelectModsPat <f32>;
+def : VOPSelectPat <f16>;
+def : VOPSelectPat <i16>;
let AddedComplexity = 1 in {
def : GCNPat <
@@ -1039,6 +1174,8 @@ def : BitConvert <v4f32, v2f64, VReg_128>;
def : BitConvert <v4i32, v2f64, VReg_128>;
def : BitConvert <v2i64, v2f64, VReg_128>;
def : BitConvert <v2f64, v2i64, VReg_128>;
+def : BitConvert <v4f32, v2i64, VReg_128>;
+def : BitConvert <v2i64, v4f32, VReg_128>;
// 160-bit bitcast
def : BitConvert <v5i32, v5f32, SGPR_160>;
@@ -1049,14 +1186,46 @@ def : BitConvert <v8i32, v8f32, SReg_256>;
def : BitConvert <v8f32, v8i32, SReg_256>;
def : BitConvert <v8i32, v8f32, VReg_256>;
def : BitConvert <v8f32, v8i32, VReg_256>;
+def : BitConvert <v4i64, v4f64, VReg_256>;
+def : BitConvert <v4f64, v4i64, VReg_256>;
+def : BitConvert <v4i64, v8i32, VReg_256>;
+def : BitConvert <v4i64, v8f32, VReg_256>;
+def : BitConvert <v4f64, v8i32, VReg_256>;
+def : BitConvert <v4f64, v8f32, VReg_256>;
+def : BitConvert <v8i32, v4i64, VReg_256>;
+def : BitConvert <v8f32, v4i64, VReg_256>;
+def : BitConvert <v8i32, v4f64, VReg_256>;
+def : BitConvert <v8f32, v4f64, VReg_256>;
+
// 512-bit bitcast
def : BitConvert <v16i32, v16f32, VReg_512>;
def : BitConvert <v16f32, v16i32, VReg_512>;
+def : BitConvert <v8i64, v8f64, VReg_512>;
+def : BitConvert <v8f64, v8i64, VReg_512>;
+def : BitConvert <v8i64, v16i32, VReg_512>;
+def : BitConvert <v8f64, v16i32, VReg_512>;
+def : BitConvert <v16i32, v8i64, VReg_512>;
+def : BitConvert <v16i32, v8f64, VReg_512>;
+def : BitConvert <v8i64, v16f32, VReg_512>;
+def : BitConvert <v8f64, v16f32, VReg_512>;
+def : BitConvert <v16f32, v8i64, VReg_512>;
+def : BitConvert <v16f32, v8f64, VReg_512>;
// 1024-bit bitcast
def : BitConvert <v32i32, v32f32, VReg_1024>;
def : BitConvert <v32f32, v32i32, VReg_1024>;
+def : BitConvert <v16i64, v16f64, VReg_1024>;
+def : BitConvert <v16f64, v16i64, VReg_1024>;
+def : BitConvert <v16i64, v32i32, VReg_1024>;
+def : BitConvert <v32i32, v16i64, VReg_1024>;
+def : BitConvert <v16f64, v32f32, VReg_1024>;
+def : BitConvert <v32f32, v16f64, VReg_1024>;
+def : BitConvert <v16i64, v32f32, VReg_1024>;
+def : BitConvert <v32i32, v16f64, VReg_1024>;
+def : BitConvert <v16f64, v32i32, VReg_1024>;
+def : BitConvert <v32f32, v16i64, VReg_1024>;
+
/********** =================== **********/
/********** Src & Dst modifiers **********/
@@ -1155,7 +1324,7 @@ def : GCNPat <
(S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit
>;
-// FIXME: The implicit-def of scc from S_[X]OR_B32 is mishandled
+// FIXME: The implicit-def of scc from S_[X]OR/AND_B32 is mishandled
// def : GCNPat <
// (fneg (f64 SReg_64:$src)),
// (REG_SEQUENCE SReg_64,
@@ -1176,6 +1345,17 @@ def : GCNPat <
// sub1)
// >;
+// FIXME: Use S_BITSET0_B32/B64?
+// def : GCNPat <
+// (fabs (f64 SReg_64:$src)),
+// (REG_SEQUENCE SReg_64,
+// (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)),
+// sub0,
+// (S_AND_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)),
+// (i32 (S_MOV_B32 (i32 0x7fffffff)))),
+// sub1)
+// >;
+
} // End let AddedComplexity = 1
def : GCNPat <
@@ -1372,11 +1552,12 @@ class Ext32Pat <SDNode ext> : GCNPat <
def : Ext32Pat <zext>;
def : Ext32Pat <anyext>;
-// The multiplication scales from [0,1] to the unsigned integer range
+// The multiplication scales from [0,1) to the unsigned integer range,
+// rounding down a bit to avoid unwanted overflow.
def : GCNPat <
(AMDGPUurecip i32:$src0),
(V_CVT_U32_F32_e32
- (V_MUL_F32_e32 (i32 CONST.FP_UINT_MAX_PLUS_1),
+ (V_MUL_F32_e32 (i32 CONST.FP_4294966784),
(V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0))))
>;
@@ -1421,11 +1602,13 @@ defm : SI_INDIRECT_Pattern <v2f32, f32, "V2">;
defm : SI_INDIRECT_Pattern <v4f32, f32, "V4">;
defm : SI_INDIRECT_Pattern <v8f32, f32, "V8">;
defm : SI_INDIRECT_Pattern <v16f32, f32, "V16">;
+defm : SI_INDIRECT_Pattern <v32f32, f32, "V32">;
defm : SI_INDIRECT_Pattern <v2i32, i32, "V2">;
defm : SI_INDIRECT_Pattern <v4i32, i32, "V4">;
defm : SI_INDIRECT_Pattern <v8i32, i32, "V8">;
defm : SI_INDIRECT_Pattern <v16i32, i32, "V16">;
+defm : SI_INDIRECT_Pattern <v32i32, i32, "V32">;
//===----------------------------------------------------------------------===//
// SAD Patterns
@@ -1695,102 +1878,187 @@ def : GCNPat <
def : GCNPat <
(i32 (bswap i32:$a)),
(V_BFI_B32 (S_MOV_B32 (i32 0x00ff00ff)),
- (V_ALIGNBIT_B32 $a, $a, (i32 24)),
- (V_ALIGNBIT_B32 $a, $a, (i32 8)))
+ (V_ALIGNBIT_B32 VSrc_b32:$a, VSrc_b32:$a, (i32 24)),
+ (V_ALIGNBIT_B32 VSrc_b32:$a, VSrc_b32:$a, (i32 8)))
>;
-let OtherPredicates = [NoFP16Denormals] in {
-def : GCNPat<
- (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
- (V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src, 0, 0)
+// FIXME: This should have been narrowed to i32 during legalization.
+// This pattern should also be skipped for GlobalISel
+def : GCNPat <
+ (i64 (bswap i64:$a)),
+ (REG_SEQUENCE VReg_64,
+ (V_BFI_B32 (S_MOV_B32 (i32 0x00ff00ff)),
+ (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
+ (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
+ (i32 24)),
+ (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
+ (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
+ (i32 8))),
+ sub0,
+ (V_BFI_B32 (S_MOV_B32 (i32 0x00ff00ff)),
+ (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
+ (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
+ (i32 24)),
+ (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
+ (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
+ (i32 8))),
+ sub1)
+>;
+
+// FIXME: The AddedComplexity should not be needed, but in GlobalISel
+// the BFI pattern ends up taking precedence without it.
+let SubtargetPredicate = isGFX8Plus, AddedComplexity = 1 in {
+// Magic number: 3 | (2 << 8) | (1 << 16) | (0 << 24)
+//
+// My reading of the manual suggests we should be using src0 for the
+// register value, but this is what seems to work.
+def : GCNPat <
+ (i32 (bswap i32:$a)),
+ (V_PERM_B32 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x00010203)))
>;
-def : GCNPat<
- (fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))),
- (V_MUL_F16_e64 0, (i32 CONST.FP16_NEG_ONE), $src_mods, $src, 0, 0)
+// FIXME: This should have been narrowed to i32 during legalization.
+// This pattern should also be skipped for GlobalISel
+def : GCNPat <
+ (i64 (bswap i64:$a)),
+ (REG_SEQUENCE VReg_64,
+ (V_PERM_B32 (i32 0), (EXTRACT_SUBREG VReg_64:$a, sub1),
+ (S_MOV_B32 (i32 0x00010203))),
+ sub0,
+ (V_PERM_B32 (i32 0), (EXTRACT_SUBREG VReg_64:$a, sub0),
+ (S_MOV_B32 (i32 0x00010203))),
+ sub1)
>;
-def : GCNPat<
- (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
- (V_PK_MUL_F16 0, (i32 CONST.FP16_ONE), $src_mods, $src, DSTCLAMP.NONE)
+// Magic number: 1 | (0 << 8) | (12 << 16) | (12 << 24)
+// The 12s emit 0s.
+def : GCNPat <
+ (i16 (bswap i16:$a)),
+ (V_PERM_B32 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001)))
>;
-}
-let OtherPredicates = [FP16Denormals] in {
-def : GCNPat<
- (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
- (V_MAX_F16_e64 $src_mods, $src, $src_mods, $src, 0, 0)
+def : GCNPat <
+ (i32 (zext (bswap i16:$a))),
+ (V_PERM_B32 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001)))
>;
-let SubtargetPredicate = HasVOP3PInsts in {
-def : GCNPat<
- (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
- (V_PK_MAX_F16 $src_mods, $src, $src_mods, $src, DSTCLAMP.NONE)
+// Magic number: 1 | (0 << 8) | (3 << 16) | (2 << 24)
+def : GCNPat <
+ (v2i16 (bswap v2i16:$a)),
+ (V_PERM_B32 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x02030001)))
>;
+
}
-}
-let OtherPredicates = [NoFP32Denormals] in {
+
+// Prefer selecting to max when legal, but using mul is always valid.
+let AddedComplexity = -5 in {
def : GCNPat<
- (fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))),
- (V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), $src_mods, $src, 0, 0)
+ (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
+ (V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src)
>;
def : GCNPat<
- (fcanonicalize (f32 (fneg (VOP3Mods f32:$src, i32:$src_mods)))),
- (V_MUL_F32_e64 0, (i32 CONST.FP32_NEG_ONE), $src_mods, $src, 0, 0)
+ (fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))),
+ (V_MUL_F16_e64 0, (i32 CONST.FP16_NEG_ONE), $src_mods, $src)
+>;
+
+def : GCNPat<
+ (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
+ (V_PK_MUL_F16 0, (i32 CONST.FP16_ONE), $src_mods, $src, DSTCLAMP.NONE)
>;
-}
-let OtherPredicates = [FP32Denormals] in {
def : GCNPat<
(fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))),
- (V_MAX_F32_e64 $src_mods, $src, $src_mods, $src, 0, 0)
+ (V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), $src_mods, $src)
>;
-}
-let OtherPredicates = [NoFP64Denormals] in {
def : GCNPat<
- (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
- (V_MUL_F64 0, CONST.FP64_ONE, $src_mods, $src, 0, 0)
+ (fcanonicalize (f32 (fneg (VOP3Mods f32:$src, i32:$src_mods)))),
+ (V_MUL_F32_e64 0, (i32 CONST.FP32_NEG_ONE), $src_mods, $src)
>;
-}
-let OtherPredicates = [FP64Denormals] in {
+// TODO: Handle fneg like other types.
def : GCNPat<
(fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
- (V_MAX_F64 $src_mods, $src, $src_mods, $src, 0, 0)
+ (V_MUL_F64 0, CONST.FP64_ONE, $src_mods, $src)
>;
+} // End AddedComplexity = -5
+
+multiclass SelectCanonicalizeAsMax<
+ list<Predicate> f32_preds = [],
+ list<Predicate> f64_preds = [],
+ list<Predicate> f16_preds = []> {
+ def : GCNPat<
+ (fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))),
+ (V_MAX_F32_e64 $src_mods, $src, $src_mods, $src)> {
+ let OtherPredicates = f32_preds;
+ }
+
+ def : GCNPat<
+ (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
+ (V_MAX_F64 $src_mods, $src, $src_mods, $src)> {
+ let OtherPredicates = f64_preds;
+ }
+
+ def : GCNPat<
+ (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
+ (V_MAX_F16_e64 $src_mods, $src, $src_mods, $src, 0, 0)> {
+ // FIXME: Should have 16-bit inst subtarget predicate
+ let OtherPredicates = f16_preds;
+ }
+
+ def : GCNPat<
+ (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
+ (V_PK_MAX_F16 $src_mods, $src, $src_mods, $src, DSTCLAMP.NONE)> {
+ // FIXME: Should have VOP3P subtarget predicate
+ let OtherPredicates = f16_preds;
+ }
}
+// On pre-gfx9 targets, v_max_*/v_min_* did not respect the denormal
+// mode, and would never flush. For f64, it's faster to implement
+// this with a max. For f16/f32 it's a wash, but prefer max when
+// valid.
+//
+// FIXME: Lowering f32/f16 with max is worse since we can use a
+// smaller encoding if the input is fneg'd. It also adds an extra
+// register use.
+let SubtargetPredicate = HasMinMaxDenormModes in {
+ defm : SelectCanonicalizeAsMax<[], [], []>;
+} // End SubtargetPredicate = HasMinMaxDenormModes
+
+let SubtargetPredicate = NotHasMinMaxDenormModes in {
+ // Use the max lowering if we don't need to flush.
+
+ // FIXME: We don't use this for f32 as a workaround for the
+ // library being compiled with the default ieee mode, but
+ // potentially being called from flushing kernels. Really we should
+ // not be mixing code expecting different default FP modes, but mul
+ // works in any FP environment.
+ defm : SelectCanonicalizeAsMax<[FalsePredicate], [FP64Denormals], [FP16Denormals]>;
+} // End SubtargetPredicate = NotHasMinMaxDenormModes
+
+
let OtherPredicates = [HasDLInsts] in {
def : GCNPat <
- (fma (f32 (VOP3Mods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
+ (fma (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)),
(f32 (VOP3Mods f32:$src1, i32:$src1_modifiers)),
(f32 (VOP3NoMods f32:$src2))),
(V_FMAC_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
- SRCMODS.NONE, $src2, $clamp, $omod)
+ SRCMODS.NONE, $src2)
>;
} // End OtherPredicates = [HasDLInsts]
let SubtargetPredicate = isGFX10Plus in
def : GCNPat <
- (fma (f16 (VOP3Mods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
+ (fma (f16 (VOP3Mods f32:$src0, i32:$src0_modifiers)),
(f16 (VOP3Mods f32:$src1, i32:$src1_modifiers)),
(f16 (VOP3NoMods f32:$src2))),
(V_FMAC_F16_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
- SRCMODS.NONE, $src2, $clamp, $omod)
->;
-
-// Allow integer inputs
-class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : GCNPat<
- (node (i8 timm:$tgt), (i8 timm:$en), vt:$src0, vt:$src1, vt:$src2, vt:$src3, (i1 timm:$compr), (i1 timm:$vm)),
- (Inst i8:$tgt, vt:$src0, vt:$src1, vt:$src2, vt:$src3, i1:$vm, i1:$compr, i8:$en)
+ SRCMODS.NONE, $src2)
>;
-def : ExpPattern<AMDGPUexport, i32, EXP>;
-def : ExpPattern<AMDGPUexport_done, i32, EXP_DONE>;
-
// COPY is a workaround for a tablegen bug with S_LSHL_B32's multiple
// outputs from its implicit scc def.
def : GCNPat <
@@ -1873,19 +2141,20 @@ def : GCNPat <
>;
def : GCNPat <
- (i64 (int_amdgcn_mov_dpp i64:$src, timm:$dpp_ctrl, timm:$row_mask, timm:$bank_mask,
- timm:$bound_ctrl)),
- (V_MOV_B64_DPP_PSEUDO $src, $src, (as_i32imm $dpp_ctrl),
- (as_i32imm $row_mask), (as_i32imm $bank_mask),
- (as_i1imm $bound_ctrl))
+ (i64 (int_amdgcn_mov_dpp i64:$src, timm:$dpp_ctrl, timm:$row_mask,
+ timm:$bank_mask, timm:$bound_ctrl)),
+ (V_MOV_B64_DPP_PSEUDO VReg_64:$src, VReg_64:$src,
+ (as_i32timm $dpp_ctrl), (as_i32timm $row_mask),
+ (as_i32timm $bank_mask),
+ (as_i1timm $bound_ctrl))
>;
def : GCNPat <
(i64 (int_amdgcn_update_dpp i64:$old, i64:$src, timm:$dpp_ctrl, timm:$row_mask,
timm:$bank_mask, timm:$bound_ctrl)),
- (V_MOV_B64_DPP_PSEUDO $old, $src, (as_i32imm $dpp_ctrl),
- (as_i32imm $row_mask), (as_i32imm $bank_mask),
- (as_i1imm $bound_ctrl))
+ (V_MOV_B64_DPP_PSEUDO VReg_64:$old, VReg_64:$src, (as_i32timm $dpp_ctrl),
+ (as_i32timm $row_mask), (as_i32timm $bank_mask),
+ (as_i1timm $bound_ctrl))
>;
//===----------------------------------------------------------------------===//
@@ -1901,6 +2170,11 @@ let SubtargetPredicate = isGFX6 in {
// fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
// Convert floor(x) to (x - fract(x))
+
+// Don't bother handling this for GlobalISel, it's handled during
+// lowering.
+//
+// FIXME: DAG should also custom lower this.
def : GCNPat <
(f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))),
(V_ADD_F64
@@ -1910,13 +2184,11 @@ def : GCNPat <
(V_CNDMASK_B64_PSEUDO
(V_MIN_F64
SRCMODS.NONE,
- (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE),
+ (V_FRACT_F64_e64 $mods, $x),
SRCMODS.NONE,
- (V_MOV_B64_PSEUDO 0x3fefffffffffffff),
- DSTCLAMP.NONE, DSTOMOD.NONE),
+ (V_MOV_B64_PSEUDO 0x3fefffffffffffff)),
$x,
- (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, (i32 3 /*NaN*/))),
- DSTCLAMP.NONE, DSTOMOD.NONE)
+ (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, (i32 3 /*NaN*/))))
>;
} // End SubtargetPredicates = isGFX6
@@ -2061,13 +2333,164 @@ def G_AMDGPU_FFBH_U32 : AMDGPUGenericInstruction {
let hasSideEffects = 0;
}
+def G_AMDGPU_RCP_IFLAG : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type1:$src);
+ let hasSideEffects = 0;
+}
+
+class BufferLoadGenericInstruction : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type1:$rsrc, type2:$vindex, type2:$voffset,
+ type2:$soffset, untyped_imm_0:$offset,
+ untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
+ let hasSideEffects = 0;
+ let mayLoad = 1;
+}
+
+class TBufferLoadGenericInstruction : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type1:$rsrc, type2:$vindex, type2:$voffset,
+ type2:$soffset, untyped_imm_0:$offset, untyped_imm_0:$format,
+ untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
+ let hasSideEffects = 0;
+ let mayLoad = 1;
+}
+
+def G_AMDGPU_BUFFER_LOAD_UBYTE : BufferLoadGenericInstruction;
+def G_AMDGPU_BUFFER_LOAD_SBYTE : BufferLoadGenericInstruction;
+def G_AMDGPU_BUFFER_LOAD_USHORT : BufferLoadGenericInstruction;
+def G_AMDGPU_BUFFER_LOAD_SSHORT : BufferLoadGenericInstruction;
+def G_AMDGPU_BUFFER_LOAD : BufferLoadGenericInstruction;
+def G_AMDGPU_BUFFER_LOAD_FORMAT : BufferLoadGenericInstruction;
+def G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : BufferLoadGenericInstruction;
+def G_AMDGPU_TBUFFER_LOAD_FORMAT : TBufferLoadGenericInstruction;
+def G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : TBufferLoadGenericInstruction;
+
+class BufferStoreGenericInstruction : AMDGPUGenericInstruction {
+ let OutOperandList = (outs);
+ let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
+ type2:$soffset, untyped_imm_0:$offset,
+ untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
+ let hasSideEffects = 0;
+ let mayStore = 1;
+}
+
+class TBufferStoreGenericInstruction : AMDGPUGenericInstruction {
+ let OutOperandList = (outs);
+ let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
+ type2:$soffset, untyped_imm_0:$offset,
+ untyped_imm_0:$format,
+ untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
+ let hasSideEffects = 0;
+ let mayStore = 1;
+}
+
+def G_AMDGPU_BUFFER_STORE : BufferStoreGenericInstruction;
+def G_AMDGPU_BUFFER_STORE_BYTE : BufferStoreGenericInstruction;
+def G_AMDGPU_BUFFER_STORE_SHORT : BufferStoreGenericInstruction;
+def G_AMDGPU_BUFFER_STORE_FORMAT : BufferStoreGenericInstruction;
+def G_AMDGPU_BUFFER_STORE_FORMAT_D16 : BufferStoreGenericInstruction;
+def G_AMDGPU_TBUFFER_STORE_FORMAT : TBufferStoreGenericInstruction;
+def G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : TBufferStoreGenericInstruction;
+
+def G_AMDGPU_FMIN_LEGACY : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src0, type0:$src1);
+ let hasSideEffects = 0;
+}
+
+def G_AMDGPU_FMAX_LEGACY : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src0, type0:$src1);
+ let hasSideEffects = 0;
+}
+
+foreach N = 0-3 in {
+def G_AMDGPU_CVT_F32_UBYTE#N : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src0);
+ let hasSideEffects = 0;
+}
+}
+
// Atomic cmpxchg. $cmpval and $newval are packed in a single vector
// operand. Expects a MachineMemOperand in addition to explicit
// operands.
def G_AMDGPU_ATOMIC_CMPXCHG : AMDGPUGenericInstruction {
let OutOperandList = (outs type0:$oldval);
- let InOperandList = (ins ptype1:$addr, type0:$cmpval_nnenwval);
+ let InOperandList = (ins ptype1:$addr, type0:$cmpval_newval);
+ let hasSideEffects = 0;
+ let mayLoad = 1;
+ let mayStore = 1;
+}
+
+let Namespace = "AMDGPU" in {
+def G_AMDGPU_ATOMIC_INC : G_ATOMICRMW_OP;
+def G_AMDGPU_ATOMIC_DEC : G_ATOMICRMW_OP;
+}
+
+class BufferAtomicGenericInstruction : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
+ type2:$soffset, untyped_imm_0:$offset,
+ untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
let hasSideEffects = 0;
let mayLoad = 1;
let mayStore = 1;
}
+
+def G_AMDGPU_BUFFER_ATOMIC_SWAP : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_ADD : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_SUB : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_SMIN : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_UMIN : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_SMAX : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_UMAX : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_AND : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_OR : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_XOR : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_DEC : BufferAtomicGenericInstruction;
+
+def G_AMDGPU_BUFFER_ATOMIC_CMPSWAP : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$vdata, type0:$cmp, type1:$rsrc, type2:$vindex,
+ type2:$voffset, type2:$soffset, untyped_imm_0:$offset,
+ untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
+ let hasSideEffects = 0;
+ let mayLoad = 1;
+ let mayStore = 1;
+}
+
+// Wrapper around llvm.amdgcn.s.buffer.load. This is mostly needed as
+// a workaround for the intrinsic being defined as readnone, even though
+// it really needs a memory operand.
+def G_AMDGPU_S_BUFFER_LOAD : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type1:$rsrc, type2:$offset, untyped_imm_0:$cachepolicy);
+ let hasSideEffects = 0;
+ let mayLoad = 1;
+ let mayStore = 0;
+}
+
+// This is equivalent to the G_INTRINSIC*, but the operands may have
+// been legalized depending on the subtarget requirements.
+def G_AMDGPU_INTRIN_IMAGE_LOAD : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins unknown:$intrin, variable_ops);
+ let hasSideEffects = 0;
+ let mayLoad = 1;
+
+ // FIXME: Use separate opcode for atomics.
+ let mayStore = 1;
+}
+
+// This is equivalent to the G_INTRINSIC*, but the operands may have
+// been legalized depending on the subtarget requirements.
+def G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPUGenericInstruction {
+ let OutOperandList = (outs);
+ let InOperandList = (ins unknown:$intrin, variable_ops);
+ let hasSideEffects = 0;
+ let mayStore = 1;
+}
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index d2b1abc8a9fb8..2eb1c52f1b595 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -103,15 +103,19 @@ enum InstClassEnum {
TBUFFER_STORE,
};
-enum RegisterEnum {
- SBASE = 0x1,
- SRSRC = 0x2,
- SOFFSET = 0x4,
- VADDR = 0x8,
- ADDR = 0x10,
- SSAMP = 0x20,
+struct AddressRegs {
+ unsigned char NumVAddrs = 0;
+ bool SBase = false;
+ bool SRsrc = false;
+ bool SOffset = false;
+ bool VAddr = false;
+ bool Addr = false;
+ bool SSamp = false;
};
+// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
+const unsigned MaxAddressRegs = 12 + 1 + 1;
+
class SILoadStoreOptimizer : public MachineFunctionPass {
struct CombineInfo {
MachineBasicBlock::iterator I;
@@ -126,10 +130,10 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
bool SLC;
bool DLC;
bool UseST64;
- SmallVector<MachineInstr *, 8> InstsToMove;
- int AddrIdx[5];
- const MachineOperand *AddrReg[5];
+ int AddrIdx[MaxAddressRegs];
+ const MachineOperand *AddrReg[MaxAddressRegs];
unsigned NumAddresses;
+ unsigned Order;
bool hasSameBaseAddress(const MachineInstr &MI) {
for (unsigned i = 0; i < NumAddresses; i++) {
@@ -183,8 +187,8 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
};
struct BaseRegisters {
- unsigned LoReg = 0;
- unsigned HiReg = 0;
+ Register LoReg;
+ Register HiReg;
unsigned LoSubReg = 0;
unsigned HiSubReg = 0;
@@ -201,7 +205,6 @@ private:
const GCNSubtarget *STM = nullptr;
const SIInstrInfo *TII = nullptr;
const SIRegisterInfo *TRI = nullptr;
- const MCSubtargetInfo *STI = nullptr;
MachineRegisterInfo *MRI = nullptr;
AliasAnalysis *AA = nullptr;
bool OptimizeAgain;
@@ -209,9 +212,9 @@ private:
static bool dmasksCanBeCombined(const CombineInfo &CI,
const SIInstrInfo &TII,
const CombineInfo &Paired);
- static bool offsetsCanBeCombined(CombineInfo &CI, const MCSubtargetInfo &STI,
- CombineInfo &Paired);
- static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI,
+ static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
+ CombineInfo &Paired, bool Modify = false);
+ static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
const CombineInfo &Paired);
static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
@@ -219,25 +222,42 @@ private:
const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
const CombineInfo &Paired);
- bool findMatchingInst(CombineInfo &CI, CombineInfo &Paired);
+ bool checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired,
+ SmallVectorImpl<MachineInstr *> &InstsToMove);
unsigned read2Opcode(unsigned EltSize) const;
unsigned read2ST64Opcode(unsigned EltSize) const;
- MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired);
+ MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI,
+ CombineInfo &Paired,
+ const SmallVectorImpl<MachineInstr *> &InstsToMove);
unsigned write2Opcode(unsigned EltSize) const;
unsigned write2ST64Opcode(unsigned EltSize) const;
- MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired);
- MachineBasicBlock::iterator mergeImagePair(CombineInfo &CI, CombineInfo &Paired);
- MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired);
- MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired);
- MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired);
- MachineBasicBlock::iterator mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired);
- MachineBasicBlock::iterator mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired);
-
- void updateBaseAndOffset(MachineInstr &I, unsigned NewBase,
+ MachineBasicBlock::iterator
+ mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
+ const SmallVectorImpl<MachineInstr *> &InstsToMove);
+ MachineBasicBlock::iterator
+ mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
+ const SmallVectorImpl<MachineInstr *> &InstsToMove);
+ MachineBasicBlock::iterator
+ mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
+ const SmallVectorImpl<MachineInstr *> &InstsToMove);
+ MachineBasicBlock::iterator
+ mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
+ const SmallVectorImpl<MachineInstr *> &InstsToMove);
+ MachineBasicBlock::iterator
+ mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
+ const SmallVectorImpl<MachineInstr *> &InstsToMove);
+ MachineBasicBlock::iterator
+ mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
+ const SmallVectorImpl<MachineInstr *> &InstsToMove);
+ MachineBasicBlock::iterator
+ mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
+ const SmallVectorImpl<MachineInstr *> &InstsToMove);
+
+ void updateBaseAndOffset(MachineInstr &I, Register NewBase,
int32_t NewOffset) const;
- unsigned computeBase(MachineInstr &MI, const MemAddress &Addr) const;
+ Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
Optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
@@ -249,8 +269,11 @@ private:
SmallPtrSet<MachineInstr *, 4> &Promoted) const;
void addInstToMergeableList(const CombineInfo &CI,
std::list<std::list<CombineInfo> > &MergeableInsts) const;
- bool collectMergeableInsts(MachineBasicBlock &MBB,
- std::list<std::list<CombineInfo> > &MergeableInsts) const;
+
+ std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
+ MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
+ MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
+ std::list<std::list<CombineInfo>> &MergeableInsts) const;
public:
static char ID;
@@ -259,8 +282,6 @@ public:
initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
}
- void removeCombinedInst(std::list<CombineInfo> &MergeList,
- const MachineInstr &MI);
bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
bool &OptimizeListAgain);
bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
@@ -275,6 +296,11 @@ public:
MachineFunctionPass::getAnalysisUsage(AU);
}
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties()
+ .set(MachineFunctionProperties::Property::IsSSA);
+ }
};
static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
@@ -327,7 +353,8 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
}
if (TII.isMIMG(Opc)) {
// Ignore instructions encoded without vaddr.
- if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1)
+ if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1 &&
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) == -1)
return UNKNOWN;
// TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
@@ -400,58 +427,54 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
}
}
-static unsigned getRegs(unsigned Opc, const SIInstrInfo &TII) {
- if (TII.isMUBUF(Opc)) {
- unsigned result = 0;
+static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
+ AddressRegs Result;
- if (AMDGPU::getMUBUFHasVAddr(Opc)) {
- result |= VADDR;
- }
-
- if (AMDGPU::getMUBUFHasSrsrc(Opc)) {
- result |= SRSRC;
- }
-
- if (AMDGPU::getMUBUFHasSoffset(Opc)) {
- result |= SOFFSET;
- }
-
- return result;
+ if (TII.isMUBUF(Opc)) {
+ if (AMDGPU::getMUBUFHasVAddr(Opc))
+ Result.VAddr = true;
+ if (AMDGPU::getMUBUFHasSrsrc(Opc))
+ Result.SRsrc = true;
+ if (AMDGPU::getMUBUFHasSoffset(Opc))
+ Result.SOffset = true;
+
+ return Result;
}
if (TII.isMIMG(Opc)) {
- unsigned result = VADDR | SRSRC;
+ int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
+ if (VAddr0Idx >= 0) {
+ int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
+ Result.NumVAddrs = SRsrcIdx - VAddr0Idx;
+ } else {
+ Result.VAddr = true;
+ }
+ Result.SRsrc = true;
const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
- result |= SSAMP;
+ Result.SSamp = true;
- return result;
+ return Result;
}
if (TII.isMTBUF(Opc)) {
- unsigned result = 0;
-
- if (AMDGPU::getMTBUFHasVAddr(Opc)) {
- result |= VADDR;
- }
-
- if (AMDGPU::getMTBUFHasSrsrc(Opc)) {
- result |= SRSRC;
- }
-
- if (AMDGPU::getMTBUFHasSoffset(Opc)) {
- result |= SOFFSET;
- }
-
- return result;
+ if (AMDGPU::getMTBUFHasVAddr(Opc))
+ Result.VAddr = true;
+ if (AMDGPU::getMTBUFHasSrsrc(Opc))
+ Result.SRsrc = true;
+ if (AMDGPU::getMTBUFHasSoffset(Opc))
+ Result.SOffset = true;
+
+ return Result;
}
switch (Opc) {
default:
- return 0;
+ return Result;
case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
- return SBASE;
+ Result.SBase = true;
+ return Result;
case AMDGPU::DS_READ_B32:
case AMDGPU::DS_READ_B64:
case AMDGPU::DS_READ_B32_gfx9:
@@ -460,7 +483,8 @@ static unsigned getRegs(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::DS_WRITE_B64:
case AMDGPU::DS_WRITE_B32_gfx9:
case AMDGPU::DS_WRITE_B64_gfx9:
- return ADDR;
+ Result.Addr = true;
+ return Result;
}
}
@@ -486,7 +510,7 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
: 4;
break;
case S_BUFFER_LOAD_IMM:
- EltSize = AMDGPU::getSMRDEncodedOffset(STM, 4);
+ EltSize = AMDGPU::convertSMRDOffsetUnits(STM, 4);
break;
default:
EltSize = 4;
@@ -495,6 +519,8 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
if (InstClass == MIMG) {
DMask = TII.getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
+ // Offset is not considered for MIMG instructions.
+ Offset = 0;
} else {
int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
Offset = I->getOperand(OffsetIdx).getImm();
@@ -515,40 +541,34 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
DLC = TII.getNamedOperand(*I, AMDGPU::OpName::dlc)->getImm();
}
- unsigned AddrOpName[5] = {0};
- NumAddresses = 0;
- const unsigned Regs = getRegs(I->getOpcode(), TII);
-
- if (Regs & ADDR) {
- AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
- }
+ AddressRegs Regs = getRegs(Opc, TII);
- if (Regs & SBASE) {
- AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
- }
-
- if (Regs & SRSRC) {
- AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
- }
-
- if (Regs & SOFFSET) {
- AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
- }
-
- if (Regs & VADDR) {
- AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
- }
-
- if (Regs & SSAMP) {
- AddrOpName[NumAddresses++] = AMDGPU::OpName::ssamp;
- }
-
- for (unsigned i = 0; i < NumAddresses; i++) {
- AddrIdx[i] = AMDGPU::getNamedOperandIdx(I->getOpcode(), AddrOpName[i]);
- AddrReg[i] = &I->getOperand(AddrIdx[i]);
- }
-
- InstsToMove.clear();
+ NumAddresses = 0;
+ for (unsigned J = 0; J < Regs.NumVAddrs; J++)
+ AddrIdx[NumAddresses++] =
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
+ if (Regs.Addr)
+ AddrIdx[NumAddresses++] =
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
+ if (Regs.SBase)
+ AddrIdx[NumAddresses++] =
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
+ if (Regs.SRsrc)
+ AddrIdx[NumAddresses++] =
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
+ if (Regs.SOffset)
+ AddrIdx[NumAddresses++] =
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
+ if (Regs.VAddr)
+ AddrIdx[NumAddresses++] =
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
+ if (Regs.SSamp)
+ AddrIdx[NumAddresses++] =
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp);
+ assert(NumAddresses <= MaxAddressRegs);
+
+ for (unsigned J = 0; J < NumAddresses; J++)
+ AddrReg[J] = &I->getOperand(AddrIdx[J]);
}
} // end anonymous namespace.
@@ -578,8 +598,8 @@ static void moveInstsAfter(MachineBasicBlock::iterator I,
}
static void addDefsUsesToList(const MachineInstr &MI,
- DenseSet<unsigned> &RegDefs,
- DenseSet<unsigned> &PhysRegUses) {
+ DenseSet<Register> &RegDefs,
+ DenseSet<Register> &PhysRegUses) {
for (const MachineOperand &Op : MI.operands()) {
if (Op.isReg()) {
if (Op.isDef())
@@ -601,8 +621,8 @@ static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
// Add MI and its defs to the lists if MI reads one of the defs that are
// already in the list. Returns true in that case.
-static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs,
- DenseSet<unsigned> &PhysRegUses,
+static bool addToListsIfDependent(MachineInstr &MI, DenseSet<Register> &RegDefs,
+ DenseSet<Register> &PhysRegUses,
SmallVectorImpl<MachineInstr *> &Insts) {
for (MachineOperand &Use : MI.operands()) {
// If one of the defs is read, then there is a use of Def between I and the
@@ -671,7 +691,8 @@ bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
// Check other optional immediate operands for equality.
unsigned OperandsToMatch[] = {AMDGPU::OpName::glc, AMDGPU::OpName::slc,
AMDGPU::OpName::d16, AMDGPU::OpName::unorm,
- AMDGPU::OpName::da, AMDGPU::OpName::r128};
+ AMDGPU::OpName::da, AMDGPU::OpName::r128,
+ AMDGPU::OpName::a16, AMDGPU::OpName::dlc};
for (auto op : OperandsToMatch) {
int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
@@ -695,7 +716,7 @@ bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
unsigned ComponentCount,
- const MCSubtargetInfo &STI) {
+ const GCNSubtarget &STI) {
if (ComponentCount > 4)
return 0;
@@ -719,8 +740,9 @@ static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
}
bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
- const MCSubtargetInfo &STI,
- CombineInfo &Paired) {
+ const GCNSubtarget &STI,
+ CombineInfo &Paired,
+ bool Modify) {
assert(CI.InstClass != MIMG);
// XXX - Would the same offset be OK? Is there any reason this would happen or
@@ -761,7 +783,7 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
CI.UseST64 = false;
CI.BaseOff = 0;
- // Handle SMEM and VMEM instructions.
+ // Handle all non-DS (SMEM and VMEM) instructions.
if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
return (EltOffset0 + CI.Width == EltOffset1 ||
EltOffset1 + Paired.Width == EltOffset0) &&
@@ -769,20 +791,25 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
(CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC == Paired.SLC);
}
+ // Handle DS instructions.
// If the offset in elements doesn't fit in 8-bits, we might be able to use
// the stride 64 versions.
if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
- CI.Offset = EltOffset0 / 64;
- Paired.Offset = EltOffset1 / 64;
- CI.UseST64 = true;
+ if (Modify) {
+ CI.Offset = EltOffset0 / 64;
+ Paired.Offset = EltOffset1 / 64;
+ CI.UseST64 = true;
+ }
return true;
}
// Check if the new offsets fit in the reduced 8-bit range.
if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
- CI.Offset = EltOffset0;
- Paired.Offset = EltOffset1;
+ if (Modify) {
+ CI.Offset = EltOffset0;
+ Paired.Offset = EltOffset1;
+ }
return true;
}
@@ -791,15 +818,19 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
CI.BaseOff = std::min(CI.Offset, Paired.Offset);
if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
- CI.Offset = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
- Paired.Offset = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
- CI.UseST64 = true;
+ if (Modify) {
+ CI.Offset = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
+ Paired.Offset = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
+ CI.UseST64 = true;
+ }
return true;
}
if (isUInt<8>(OffsetDiff)) {
- CI.Offset = EltOffset0 - CI.BaseOff / CI.EltSize;
- Paired.Offset = EltOffset1 - CI.BaseOff / CI.EltSize;
+ if (Modify) {
+ CI.Offset = EltOffset0 - CI.BaseOff / CI.EltSize;
+ Paired.Offset = EltOffset1 - CI.BaseOff / CI.EltSize;
+ }
return true;
}
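Taken out of the diff, the DS offset-encoding decision above reduces to a small amount of integer arithmetic. The stand-alone sketch below mirrors it with invented names; it omits the Modify flag and the base-offset retry that the real pass also performs.

#include <cstdint>

// Sketch of the DS offset check: both element offsets must fit the 8-bit
// offset fields, either directly or via the stride-64 (ST64) forms.
struct DSOffsets {
  uint32_t Off0, Off1; // offsets in elements (byte offset / EltSize)
  bool UseST64;
};

static bool fitsUInt8(uint32_t V) { return V <= 0xff; }

static bool encodeDSOffsets(uint32_t EltOffset0, uint32_t EltOffset1,
                            DSOffsets &Out) {
  // Stride-64 forms: both offsets are multiples of 64 and fit after /64.
  if (EltOffset0 % 64 == 0 && EltOffset1 % 64 == 0 &&
      fitsUInt8(EltOffset0 / 64) && fitsUInt8(EltOffset1 / 64)) {
    Out = {EltOffset0 / 64, EltOffset1 / 64, true};
    return true;
  }
  // Plain forms: both offsets fit the 8-bit field directly.
  if (fitsUInt8(EltOffset0) && fitsUInt8(EltOffset1)) {
    Out = {EltOffset0, EltOffset1, false};
    return true;
  }
  return false; // the real pass additionally retries with a new base offset
}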
@@ -824,11 +855,19 @@ bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
}
}
-bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI,
- CombineInfo &Paired) {
- MachineBasicBlock *MBB = CI.I->getParent();
- MachineBasicBlock::iterator E = MBB->end();
- MachineBasicBlock::iterator MBBI = CI.I;
+/// This function assumes that CI comes before Paired in a basic block.
+bool SILoadStoreOptimizer::checkAndPrepareMerge(
+ CombineInfo &CI, CombineInfo &Paired,
+ SmallVectorImpl<MachineInstr *> &InstsToMove) {
+
+ // Check both offsets (or masks for MIMG) can be combined and fit in the
+ // reduced range.
+ if (CI.InstClass == MIMG && !dmasksCanBeCombined(CI, *TII, Paired))
+ return false;
+
+ if (CI.InstClass != MIMG &&
+ (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired)))
+ return false;
const unsigned Opc = CI.I->getOpcode();
const InstClassEnum InstClass = getInstClass(Opc, *TII);
@@ -844,14 +883,25 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI,
if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm())
return false;
- ++MBBI;
-
- DenseSet<unsigned> RegDefsToMove;
- DenseSet<unsigned> PhysRegUsesToMove;
+ DenseSet<Register> RegDefsToMove;
+ DenseSet<Register> PhysRegUsesToMove;
addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
+ MachineBasicBlock::iterator E = std::next(Paired.I);
+ MachineBasicBlock::iterator MBBI = std::next(CI.I);
+ MachineBasicBlock::iterator MBBE = CI.I->getParent()->end();
for (; MBBI != E; ++MBBI) {
+ if (MBBI == MBBE) {
+ // CombineInfo::Order is a hint on the instruction ordering within the
+ // basic block. This hint suggests that CI precedes Paired, which is
+ // true most of the time. However, moveInstsAfter() processing a
+ // previous list may have changed this order when it moved an
+ // instruction that also appears in some other merge list. In that case
+ // the instruction must be dependent, so give up on this pair.
+ return false;
+ }
+
if ((getInstClass(MBBI->getOpcode(), *TII) != InstClass) ||
(getInstSubclass(MBBI->getOpcode(), *TII) != InstSubclass)) {
// This is not a matching instruction, but we can keep looking as
@@ -868,11 +918,11 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI,
if (MBBI->mayLoadOrStore() &&
(!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
- !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))) {
+ !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))) {
// We fail condition #1, but we may still be able to satisfy condition
// #2. Add this instruction to the move list and then we will check
// if condition #2 holds once we have selected the matching instruction.
- CI.InstsToMove.push_back(&*MBBI);
+ InstsToMove.push_back(&*MBBI);
addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
continue;
}
@@ -881,7 +931,7 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI,
// to the location of the matched instruction any uses of I will need to
// be moved down as well.
addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
- CI.InstsToMove);
+ InstsToMove);
continue;
}
@@ -901,26 +951,24 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI,
// where the DS_READ_B32 ends up in InstsToMove and therefore prevents
// merging of the two writes.
if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
- CI.InstsToMove))
+ InstsToMove))
continue;
- bool Match = CI.hasSameBaseAddress(*MBBI);
-
- if (Match) {
- Paired.setMI(MBBI, *TII, *STM);
-
- // Check both offsets (or masks for MIMG) can be combined and fit in the
- // reduced range.
- bool canBeCombined =
- CI.InstClass == MIMG
- ? dmasksCanBeCombined(CI, *TII, Paired)
- : widthsFit(*STM, CI, Paired) && offsetsCanBeCombined(CI, *STI, Paired);
-
- // We also need to go through the list of instructions that we plan to
+ if (&*MBBI == &*Paired.I) {
+ // We need to go through the list of instructions that we plan to
// move and make sure they are all safe to move down past the merged
// instruction.
- if (canBeCombined && canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
+ if (canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA)) {
+
+ // Call offsetsCanBeCombined with modify = true so that the offsets are
+ // correct for the new instruction. This should return true, because
+ // this function should only be called on CombineInfo objects that
+ // have already been confirmed to be mergeable.
+ if (CI.InstClass != MIMG)
+ offsetsCanBeCombined(CI, *STM, Paired, true);
return true;
+ }
+ return false;
}
// We've found a load/store that we couldn't merge for some reason.
@@ -929,7 +977,7 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI,
// down past this instruction.
// check if we can move I across MBBI and if we can move all I's users
if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
- !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
+ !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))
break;
}
return false;
@@ -950,7 +998,8 @@ unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
}
MachineBasicBlock::iterator
-SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired) {
+SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
+ const SmallVectorImpl<MachineInstr *> &InstsToMove) {
MachineBasicBlock *MBB = CI.I->getParent();
// Be careful, since the addresses could be subregisters themselves in weird
@@ -1023,7 +1072,7 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired) {
.add(*Dest1)
.addReg(DestReg, RegState::Kill, SubRegIdx1);
- moveInstsAfter(Copy1, CI.InstsToMove);
+ moveInstsAfter(Copy1, InstsToMove);
CI.I->eraseFromParent();
Paired.I->eraseFromParent();
@@ -1049,7 +1098,8 @@ unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
}
MachineBasicBlock::iterator
-SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired) {
+SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
+ const SmallVectorImpl<MachineInstr *> &InstsToMove) {
MachineBasicBlock *MBB = CI.I->getParent();
// Be sure to use .addOperand(), and not .addReg() with these. We want to be
@@ -1106,7 +1156,7 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired) {
.addImm(0) // gds
.cloneMergedMemRefs({&*CI.I, &*Paired.I});
- moveInstsAfter(Write2, CI.InstsToMove);
+ moveInstsAfter(Write2, InstsToMove);
CI.I->eraseFromParent();
Paired.I->eraseFromParent();
@@ -1116,7 +1166,8 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired) {
}
MachineBasicBlock::iterator
-SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired) {
+SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
+ const SmallVectorImpl<MachineInstr *> &InstsToMove) {
MachineBasicBlock *MBB = CI.I->getParent();
DebugLoc DL = CI.I->getDebugLoc();
const unsigned Opcode = getNewOpcode(CI, Paired);
@@ -1161,15 +1212,16 @@ SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired) {
.add(*Dest1)
.addReg(DestReg, RegState::Kill, SubRegIdx1);
- moveInstsAfter(Copy1, CI.InstsToMove);
+ moveInstsAfter(Copy1, InstsToMove);
CI.I->eraseFromParent();
Paired.I->eraseFromParent();
return New;
}
-MachineBasicBlock::iterator
-SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired) {
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
+ CombineInfo &CI, CombineInfo &Paired,
+ const SmallVectorImpl<MachineInstr *> &InstsToMove) {
MachineBasicBlock *MBB = CI.I->getParent();
DebugLoc DL = CI.I->getDebugLoc();
const unsigned Opcode = getNewOpcode(CI, Paired);
@@ -1211,15 +1263,16 @@ SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Pair
.add(*Dest1)
.addReg(DestReg, RegState::Kill, SubRegIdx1);
- moveInstsAfter(Copy1, CI.InstsToMove);
+ moveInstsAfter(Copy1, InstsToMove);
CI.I->eraseFromParent();
Paired.I->eraseFromParent();
return New;
}
-MachineBasicBlock::iterator
-SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired) {
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
+ CombineInfo &CI, CombineInfo &Paired,
+ const SmallVectorImpl<MachineInstr *> &InstsToMove) {
MachineBasicBlock *MBB = CI.I->getParent();
DebugLoc DL = CI.I->getDebugLoc();
@@ -1233,9 +1286,9 @@ SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired)
auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg);
- const unsigned Regs = getRegs(Opcode, *TII);
+ AddressRegs Regs = getRegs(Opcode, *TII);
- if (Regs & VADDR)
+ if (Regs.VAddr)
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
// It shouldn't be possible to get this far if the two instructions
@@ -1273,15 +1326,16 @@ SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired)
.add(*Dest1)
.addReg(DestReg, RegState::Kill, SubRegIdx1);
- moveInstsAfter(Copy1, CI.InstsToMove);
+ moveInstsAfter(Copy1, InstsToMove);
CI.I->eraseFromParent();
Paired.I->eraseFromParent();
return New;
}
-MachineBasicBlock::iterator
-SILoadStoreOptimizer::mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired) {
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
+ CombineInfo &CI, CombineInfo &Paired,
+ const SmallVectorImpl<MachineInstr *> &InstsToMove) {
MachineBasicBlock *MBB = CI.I->getParent();
DebugLoc DL = CI.I->getDebugLoc();
@@ -1295,13 +1349,13 @@ SILoadStoreOptimizer::mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired)
auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg);
- const unsigned Regs = getRegs(Opcode, *TII);
+ AddressRegs Regs = getRegs(Opcode, *TII);
- if (Regs & VADDR)
+ if (Regs.VAddr)
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
unsigned JoinedFormat =
- getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STI);
+ getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
// It shouldn't be possible to get this far if the two instructions
// don't have a single memoperand, because MachineInstr::mayAlias()
@@ -1340,15 +1394,16 @@ SILoadStoreOptimizer::mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired)
.add(*Dest1)
.addReg(DestReg, RegState::Kill, SubRegIdx1);
- moveInstsAfter(Copy1, CI.InstsToMove);
+ moveInstsAfter(Copy1, InstsToMove);
CI.I->eraseFromParent();
Paired.I->eraseFromParent();
return New;
}
-MachineBasicBlock::iterator
-SILoadStoreOptimizer::mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired) {
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
+ CombineInfo &CI, CombineInfo &Paired,
+ const SmallVectorImpl<MachineInstr *> &InstsToMove) {
MachineBasicBlock *MBB = CI.I->getParent();
DebugLoc DL = CI.I->getDebugLoc();
@@ -1374,13 +1429,13 @@ SILoadStoreOptimizer::mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired
auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode))
.addReg(SrcReg, RegState::Kill);
- const unsigned Regs = getRegs(Opcode, *TII);
+ AddressRegs Regs = getRegs(Opcode, *TII);
- if (Regs & VADDR)
+ if (Regs.VAddr)
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
unsigned JoinedFormat =
- getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STI);
+ getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
// It shouldn't be possible to get this far if the two instructions
// don't have a single memoperand, because MachineInstr::mayAlias()
@@ -1403,7 +1458,7 @@ SILoadStoreOptimizer::mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired
.addMemOperand(
combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
- moveInstsAfter(MIB, CI.InstsToMove);
+ moveInstsAfter(MIB, InstsToMove);
CI.I->eraseFromParent();
Paired.I->eraseFromParent();
@@ -1491,9 +1546,9 @@ SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
case 4:
return &AMDGPU::SGPR_128RegClass;
case 8:
- return &AMDGPU::SReg_256RegClass;
+ return &AMDGPU::SGPR_256RegClass;
case 16:
- return &AMDGPU::SReg_512RegClass;
+ return &AMDGPU::SGPR_512RegClass;
}
} else {
switch (CI.Width + Paired.Width) {
@@ -1509,8 +1564,9 @@ SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
}
}
-MachineBasicBlock::iterator
-SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired) {
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
+ CombineInfo &CI, CombineInfo &Paired,
+ const SmallVectorImpl<MachineInstr *> &InstsToMove) {
MachineBasicBlock *MBB = CI.I->getParent();
DebugLoc DL = CI.I->getDebugLoc();
@@ -1536,9 +1592,9 @@ SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired)
auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode))
.addReg(SrcReg, RegState::Kill);
- const unsigned Regs = getRegs(Opcode, *TII);
+ AddressRegs Regs = getRegs(Opcode, *TII);
- if (Regs & VADDR)
+ if (Regs.VAddr)
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
@@ -1561,7 +1617,7 @@ SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired)
.addImm(0) // swz
.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
- moveInstsAfter(MIB, CI.InstsToMove);
+ moveInstsAfter(MIB, InstsToMove);
CI.I->eraseFromParent();
Paired.I->eraseFromParent();
@@ -1585,7 +1641,7 @@ SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
}
// Compute base address using Addr and return the final register.
-unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
+Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
const MemAddress &Addr) const {
MachineBasicBlock *MBB = MI.getParent();
MachineBasicBlock::iterator MBBI = MI.getIterator();
@@ -1644,7 +1700,7 @@ unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
// Update base and offset with the NewBase and NewOffset in MI.
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
- unsigned NewBase,
+ Register NewBase,
int32_t NewOffset) const {
auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
Base->setReg(NewBase);
@@ -1856,7 +1912,7 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
<< AnchorAddr.Offset << "\n\n");
// Instead of moving up, just re-compute anchor-instruction's base address.
- unsigned Base = computeBase(MI, AnchorAddr);
+ Register Base = computeBase(MI, AnchorAddr);
updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););
@@ -1894,39 +1950,80 @@ void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
MergeableInsts.emplace_back(1, CI);
}
-bool SILoadStoreOptimizer::collectMergeableInsts(MachineBasicBlock &MBB,
- std::list<std::list<CombineInfo> > &MergeableInsts) const {
+std::pair<MachineBasicBlock::iterator, bool>
+SILoadStoreOptimizer::collectMergeableInsts(
+ MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
+ MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
+ std::list<std::list<CombineInfo>> &MergeableInsts) const {
bool Modified = false;
- // Contain the list
- MemInfoMap Visited;
- // Contains the list of instructions for which constant offsets are being
- // promoted to the IMM.
- SmallPtrSet<MachineInstr *, 4> AnchorList;
// Sort potential mergeable instructions into lists. One list per base address.
- for (MachineInstr &MI : MBB.instrs()) {
+ unsigned Order = 0;
+ MachineBasicBlock::iterator BlockI = Begin;
+ for (; BlockI != End; ++BlockI) {
+ MachineInstr &MI = *BlockI;
+
// We run this before checking if an address is mergeable, because it can produce
// better code even if the instructions aren't mergeable.
if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
Modified = true;
+ // Don't combine if volatile. We also won't be able to merge across this, so
+ // break the search. We can resume the search after this barrier for separate merges.
+ if (MI.hasOrderedMemoryRef()) {
+ LLVM_DEBUG(dbgs() << "Breaking search on memory fence: " << MI);
+
+ // Search will resume after this instruction in a separate merge list.
+ ++BlockI;
+ break;
+ }
+
const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
if (InstClass == UNKNOWN)
continue;
- // Don't combine if volatile.
- if (MI.hasOrderedMemoryRef())
- continue;
-
CombineInfo CI;
CI.setMI(MI, *TII, *STM);
+ CI.Order = Order++;
if (!CI.hasMergeableAddress(*MRI))
continue;
+ LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
+
addInstToMergeableList(CI, MergeableInsts);
}
- return Modified;
+
+ // At this point we have lists of mergeable instructions.
+ //
+ // Part 2: Discard any list with fewer than two entries, since a merge needs
+ // at least two instructions, and sort the remaining lists by offset so that
+ // mergeable instructions end up adjacent. The actual pairing happens later,
+ // in optimizeInstsWithSameBaseAddr().
+
+ for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
+ E = MergeableInsts.end(); I != E;) {
+
+ std::list<CombineInfo> &MergeList = *I;
+ if (MergeList.size() <= 1) {
+ // This means we have found only one instruction with a given address
+ // that can be merged, and we need at least 2 instructions to do a merge,
+ // so this list can be discarded.
+ I = MergeableInsts.erase(I);
+ continue;
+ }
+
+ // Sort the list by offset; this way mergeable instructions end up
+ // adjacent to each other, which makes it easier to find
+ // matches.
+ MergeList.sort(
+ [] (const CombineInfo &A, CombineInfo &B) {
+ return A.Offset < B.Offset;
+ });
+ ++I;
+ }
+
+ return std::make_pair(BlockI, Modified);
}
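A concrete, throwaway illustration of why the sort matters (the offsets and container are made up; the real code sorts a std::list<CombineInfo> by its Offset field): after sorting, candidates that can form a wider access sit next to each other, which is what the pairing loop in optimizeInstsWithSameBaseAddr() walks over.

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
  // Byte offsets of four mergeable loads from one base, in program order.
  std::vector<int> Offsets = {16, 0, 4, 20};
  std::sort(Offsets.begin(), Offsets.end()); // mirrors MergeList.sort() above
  // After sorting: 0 4 16 20 -- adjacent entries are the merge candidates.
  for (std::size_t I = 0; I + 1 < Offsets.size(); I += 2)
    std::printf("candidate pair: %d and %d\n", Offsets[I], Offsets[I + 1]);
  return 0;
}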
// Scan through looking for adjacent LDS operations with constant offsets from
@@ -1936,117 +2033,126 @@ bool SILoadStoreOptimizer::optimizeBlock(
std::list<std::list<CombineInfo> > &MergeableInsts) {
bool Modified = false;
- for (std::list<CombineInfo> &MergeList : MergeableInsts) {
- if (MergeList.size() < 2)
- continue;
+ for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
+ E = MergeableInsts.end(); I != E;) {
+ std::list<CombineInfo> &MergeList = *I;
bool OptimizeListAgain = false;
if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
- // We weren't able to make any changes, so clear the list so we don't
+ // We weren't able to make any changes, so delete the list so we don't
// process the same instructions the next time we try to optimize this
// block.
- MergeList.clear();
+ I = MergeableInsts.erase(I);
continue;
}
- // We made changes, but also determined that there were no more optimization
- // opportunities, so we don't need to reprocess the list
- if (!OptimizeListAgain)
- MergeList.clear();
-
- OptimizeAgain |= OptimizeListAgain;
Modified = true;
- }
- return Modified;
-}
-void
-SILoadStoreOptimizer::removeCombinedInst(std::list<CombineInfo> &MergeList,
- const MachineInstr &MI) {
-
- for (auto CI = MergeList.begin(), E = MergeList.end(); CI != E; ++CI) {
- if (&*CI->I == &MI) {
- MergeList.erase(CI);
- return;
+ // We made changes, but also determined that there were no more optimization
+ // opportunities, so we don't need to reprocess the list
+ if (!OptimizeListAgain) {
+ I = MergeableInsts.erase(I);
+ continue;
}
+ OptimizeAgain = true;
}
+ return Modified;
}
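The loop above replaces MergeList.clear() with erasing whole lists, which leans on the standard erase-while-iterating idiom for std::list. A minimal generic sketch of that idiom (names are illustrative, not from the pass):

#include <list>

// Sketch only: std::list::erase() returns the iterator following the erased
// element, so the loop advances either through erase() or through ++I.
template <typename T, typename Pred>
void eraseMatching(std::list<T> &L, Pred ShouldErase) {
  for (auto I = L.begin(); I != L.end();) {
    if (ShouldErase(*I))
      I = L.erase(I); // drop this element and continue with the next one
    else
      ++I;
  }
}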
bool
SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
std::list<CombineInfo> &MergeList,
bool &OptimizeListAgain) {
+ if (MergeList.empty())
+ return false;
+
bool Modified = false;
- for (auto I = MergeList.begin(); I != MergeList.end(); ++I) {
- CombineInfo &CI = *I;
- CombineInfo Paired;
- if (CI.InstClass == UNKNOWN)
- continue;
+ for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
+ Next = std::next(I)) {
+
+ auto First = I;
+ auto Second = Next;
+
+ if ((*First).Order > (*Second).Order)
+ std::swap(First, Second);
+ CombineInfo &CI = *First;
+ CombineInfo &Paired = *Second;
- if (!findMatchingInst(CI, Paired))
- goto done;
+ SmallVector<MachineInstr *, 8> InstsToMove;
+ if (!checkAndPrepareMerge(CI, Paired, InstsToMove)) {
+ ++I;
+ continue;
+ }
Modified = true;
- removeCombinedInst(MergeList, *Paired.I);
+
+ LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);
switch (CI.InstClass) {
default:
llvm_unreachable("unknown InstClass");
break;
case DS_READ: {
- MachineBasicBlock::iterator NewMI = mergeRead2Pair(CI, Paired);
+ MachineBasicBlock::iterator NewMI =
+ mergeRead2Pair(CI, Paired, InstsToMove);
CI.setMI(NewMI, *TII, *STM);
break;
}
case DS_WRITE: {
- MachineBasicBlock::iterator NewMI = mergeWrite2Pair(CI, Paired);
+ MachineBasicBlock::iterator NewMI =
+ mergeWrite2Pair(CI, Paired, InstsToMove);
CI.setMI(NewMI, *TII, *STM);
break;
}
case S_BUFFER_LOAD_IMM: {
- MachineBasicBlock::iterator NewMI = mergeSBufferLoadImmPair(CI, Paired);
+ MachineBasicBlock::iterator NewMI =
+ mergeSBufferLoadImmPair(CI, Paired, InstsToMove);
CI.setMI(NewMI, *TII, *STM);
OptimizeListAgain |= (CI.Width + Paired.Width) < 16;
break;
}
case BUFFER_LOAD: {
- MachineBasicBlock::iterator NewMI = mergeBufferLoadPair(CI, Paired);
+ MachineBasicBlock::iterator NewMI =
+ mergeBufferLoadPair(CI, Paired, InstsToMove);
CI.setMI(NewMI, *TII, *STM);
OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
break;
}
case BUFFER_STORE: {
- MachineBasicBlock::iterator NewMI = mergeBufferStorePair(CI, Paired);
+ MachineBasicBlock::iterator NewMI =
+ mergeBufferStorePair(CI, Paired, InstsToMove);
CI.setMI(NewMI, *TII, *STM);
OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
break;
}
case MIMG: {
- MachineBasicBlock::iterator NewMI = mergeImagePair(CI, Paired);
+ MachineBasicBlock::iterator NewMI =
+ mergeImagePair(CI, Paired, InstsToMove);
CI.setMI(NewMI, *TII, *STM);
OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
break;
}
case TBUFFER_LOAD: {
- MachineBasicBlock::iterator NewMI = mergeTBufferLoadPair(CI, Paired);
+ MachineBasicBlock::iterator NewMI =
+ mergeTBufferLoadPair(CI, Paired, InstsToMove);
CI.setMI(NewMI, *TII, *STM);
OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
break;
}
case TBUFFER_STORE: {
- MachineBasicBlock::iterator NewMI = mergeTBufferStorePair(CI, Paired);
+ MachineBasicBlock::iterator NewMI =
+ mergeTBufferStorePair(CI, Paired, InstsToMove);
CI.setMI(NewMI, *TII, *STM);
OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
break;
}
}
+ CI.Order = Paired.Order;
+ if (I == Second)
+ I = Next;
-done:
- // Clear the InstsToMove after we have finished searching so we don't have
- // stale values left over if we search for this CI again in another pass
- // over the block.
- CI.InstsToMove.clear();
+ MergeList.erase(Second);
}
return Modified;
@@ -2062,26 +2168,41 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
TII = STM->getInstrInfo();
TRI = &TII->getRegisterInfo();
- STI = &MF.getSubtarget<MCSubtargetInfo>();
MRI = &MF.getRegInfo();
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- assert(MRI->isSSA() && "Must be run on SSA");
-
LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
bool Modified = false;
+ // Contains the list of instructions for which constant offsets are being
+ // promoted to the IMM. This is tracked for an entire block at a time.
+ SmallPtrSet<MachineInstr *, 4> AnchorList;
+ MemInfoMap Visited;
for (MachineBasicBlock &MBB : MF) {
- std::list<std::list<CombineInfo> > MergeableInsts;
- // First pass: Collect list of all instructions we know how to merge.
- Modified |= collectMergeableInsts(MBB, MergeableInsts);
- do {
- OptimizeAgain = false;
- Modified |= optimizeBlock(MergeableInsts);
- } while (OptimizeAgain);
+ MachineBasicBlock::iterator SectionEnd;
+ for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
+ I = SectionEnd) {
+ bool CollectModified;
+ std::list<std::list<CombineInfo>> MergeableInsts;
+
+ // First pass: Collect list of all instructions we know how to merge in a
+ // subset of the block.
+ std::tie(SectionEnd, CollectModified) =
+ collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
+
+ Modified |= CollectModified;
+
+ do {
+ OptimizeAgain = false;
+ Modified |= optimizeBlock(MergeableInsts);
+ } while (OptimizeAgain);
+ }
+
+ Visited.clear();
+ AnchorList.clear();
}
return Modified;
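Stripped of the pass-specific details, the new per-block driver is a "process one section at a time" loop: collect up to the next ordered-memory barrier, optimize that section to a fixed point, then resume after the barrier. A hedged generic sketch of that control flow (the function names and signatures are placeholders, not the pass's API):

#include <tuple>
#include <utility>

// Sketch only: walk [Begin, End) in sections. Collect() consumes up to and
// including the next barrier and reports where to resume; Optimize() is run
// on whatever Collect() gathered for the current section.
template <typename Iter, typename CollectFn, typename OptimizeFn>
bool runInSections(Iter Begin, Iter End, CollectFn Collect,
                   OptimizeFn Optimize) {
  bool Modified = false;
  for (Iter I = Begin; I != End;) {
    Iter SectionEnd;
    bool CollectModified;
    std::tie(SectionEnd, CollectModified) = Collect(I, End);
    Modified |= CollectModified;
    Modified |= Optimize(); // merge within the section until no more changes
    I = SectionEnd;         // resume after the barrier (or at End)
  }
  return Modified;
}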
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 61d2719a3aad6..36d52ac3ee891 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -38,8 +38,8 @@
/// %vgpr0 = V_ADD_F32 %vgpr0, %vgpr0 // Do the IF block of the branch
///
/// label0:
-/// %sgpr0 = S_OR_SAVEEXEC_B64 %exec // Restore the exec mask for the Then block
-/// %exec = S_XOR_B64 %sgpr0, %exec // Clear live bits from saved exec mask
+/// %sgpr0 = S_OR_SAVEEXEC_B64 %sgpr0 // Restore the exec mask for the Then block
+/// %exec = S_XOR_B64 %sgpr0, %exec // Update the exec mask
/// S_BRANCH_EXECZ label1 // Use our branch optimization
/// // instruction again.
/// %vgpr0 = V_SUB_F32 %vgpr0, %vgpr // Do the THEN block
@@ -51,6 +51,8 @@
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/LiveIntervals.h"
@@ -73,6 +75,10 @@ using namespace llvm;
#define DEBUG_TYPE "si-lower-control-flow"
+static cl::opt<bool>
+RemoveRedundantEndcf("amdgpu-remove-redundant-endcf",
+ cl::init(true), cl::ReallyHidden);
+
namespace {
class SILowerControlFlow : public MachineFunctionPass {
@@ -81,8 +87,12 @@ private:
const SIInstrInfo *TII = nullptr;
LiveIntervals *LIS = nullptr;
MachineRegisterInfo *MRI = nullptr;
+ SetVector<MachineInstr*> LoweredEndCf;
+ DenseSet<Register> LoweredIf;
+ SmallSet<MachineInstr *, 16> NeedsKillCleanup;
const TargetRegisterClass *BoolRC = nullptr;
+ bool InsertKillCleanups;
unsigned AndOpc;
unsigned OrOpc;
unsigned XorOpc;
@@ -98,13 +108,23 @@ private:
void emitLoop(MachineInstr &MI);
void emitEndCf(MachineInstr &MI);
- Register getSaveExec(MachineInstr* MI);
-
void findMaskOperands(MachineInstr &MI, unsigned OpNo,
SmallVectorImpl<MachineOperand> &Src) const;
void combineMasks(MachineInstr &MI);
+ void process(MachineInstr &MI);
+
+ // Skip to the next instruction, ignoring debug instructions, and trivial
+ // block boundaries (blocks that have one (typically fallthrough) successor,
+ // and the successor has one predecessor).
+ MachineBasicBlock::iterator
+ skipIgnoreExecInstsTrivialSucc(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator It) const;
+
+ // Remove redundant SI_END_CF instructions.
+ void optimizeEndCf();
+
public:
static char ID;
@@ -144,62 +164,44 @@ static void setImpSCCDefDead(MachineInstr &MI, bool IsDead) {
char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;
-static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI,
- const SIInstrInfo *TII) {
- Register SaveExecReg = MI.getOperand(0).getReg();
- auto U = MRI->use_instr_nodbg_begin(SaveExecReg);
-
- if (U == MRI->use_instr_nodbg_end() ||
- std::next(U) != MRI->use_instr_nodbg_end() ||
- U->getOpcode() != AMDGPU::SI_END_CF)
- return false;
-
- // Check for SI_KILL_*_TERMINATOR on path from if to endif.
- // if there is any such terminator simplififcations are not safe.
- auto SMBB = MI.getParent();
- auto EMBB = U->getParent();
+static bool hasKill(const MachineBasicBlock *Begin,
+ const MachineBasicBlock *End, const SIInstrInfo *TII) {
DenseSet<const MachineBasicBlock*> Visited;
- SmallVector<MachineBasicBlock*, 4> Worklist(SMBB->succ_begin(),
- SMBB->succ_end());
+ SmallVector<MachineBasicBlock *, 4> Worklist(Begin->succ_begin(),
+ Begin->succ_end());
while (!Worklist.empty()) {
MachineBasicBlock *MBB = Worklist.pop_back_val();
- if (MBB == EMBB || !Visited.insert(MBB).second)
+ if (MBB == End || !Visited.insert(MBB).second)
continue;
- for(auto &Term : MBB->terminators())
+ for (auto &Term : MBB->terminators())
if (TII->isKillTerminator(Term.getOpcode()))
- return false;
+ return true;
Worklist.append(MBB->succ_begin(), MBB->succ_end());
}
- return true;
+ return false;
}
-Register SILowerControlFlow::getSaveExec(MachineInstr *MI) {
- MachineBasicBlock *MBB = MI->getParent();
- MachineOperand &SaveExec = MI->getOperand(0);
- assert(SaveExec.getSubReg() == AMDGPU::NoSubRegister);
-
- Register SaveExecReg = SaveExec.getReg();
- unsigned FalseTermOpc =
- TII->isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
- MachineBasicBlock::iterator I = (MI);
- MachineBasicBlock::iterator J = std::next(I);
- if (J != MBB->end() && J->getOpcode() == FalseTermOpc &&
- J->getOperand(1).isReg() && J->getOperand(1).getReg() == SaveExecReg) {
- SaveExecReg = J->getOperand(0).getReg();
- J->eraseFromParent();
- }
- return SaveExecReg;
+static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI) {
+ Register SaveExecReg = MI.getOperand(0).getReg();
+ auto U = MRI->use_instr_nodbg_begin(SaveExecReg);
+
+ if (U == MRI->use_instr_nodbg_end() ||
+ std::next(U) != MRI->use_instr_nodbg_end() ||
+ U->getOpcode() != AMDGPU::SI_END_CF)
+ return false;
+
+ return true;
}
void SILowerControlFlow::emitIf(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
MachineBasicBlock::iterator I(&MI);
- Register SaveExecReg = getSaveExec(&MI);
+ Register SaveExecReg = MI.getOperand(0).getReg();
MachineOperand& Cond = MI.getOperand(1);
assert(Cond.getSubReg() == AMDGPU::NoSubRegister);
@@ -209,7 +211,35 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
// If there is only one use of save exec register and that use is SI_END_CF,
// we can optimize SI_IF by returning the full saved exec mask instead of
// just cleared bits.
- bool SimpleIf = isSimpleIf(MI, MRI, TII);
+ bool SimpleIf = isSimpleIf(MI, MRI);
+
+ if (InsertKillCleanups) {
+ // Check for SI_KILL_*_TERMINATOR on full path of control flow and
+ // flag the associated SI_END_CF for insertion of a kill cleanup.
+ auto UseMI = MRI->use_instr_nodbg_begin(SaveExecReg);
+ while (UseMI->getOpcode() != AMDGPU::SI_END_CF) {
+ assert(std::next(UseMI) == MRI->use_instr_nodbg_end());
+ assert(UseMI->getOpcode() == AMDGPU::SI_ELSE);
+ MachineOperand &NextExec = UseMI->getOperand(0);
+ Register NextExecReg = NextExec.getReg();
+ if (NextExec.isDead()) {
+ assert(!SimpleIf);
+ break;
+ }
+ UseMI = MRI->use_instr_nodbg_begin(NextExecReg);
+ }
+ if (UseMI->getOpcode() == AMDGPU::SI_END_CF) {
+ if (hasKill(MI.getParent(), UseMI->getParent(), TII)) {
+ NeedsKillCleanup.insert(&*UseMI);
+ SimpleIf = false;
+ }
+ }
+ } else if (SimpleIf) {
+ // Check for SI_KILL_*_TERMINATOR on the path from if to endif.
+ // If there is any such terminator, simplifications are not safe.
+ auto UseMI = MRI->use_instr_nodbg_begin(SaveExecReg);
+ SimpleIf = !hasKill(MI.getParent(), UseMI->getParent(), TII);
+ }
// Add an implicit def of exec to discourage scheduling VALU after this which
// will interfere with trying to form s_and_saveexec_b64 later.
@@ -219,6 +249,7 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), CopyReg)
.addReg(Exec)
.addReg(Exec, RegState::ImplicitDefine);
+ LoweredIf.insert(CopyReg);
Register Tmp = MRI->createVirtualRegister(BoolRC);
@@ -282,7 +313,7 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
- Register DstReg = getSaveExec(&MI);
+ Register DstReg = MI.getOperand(0).getReg();
bool ExecModified = MI.getOperand(3).getImm() != 0;
MachineBasicBlock::iterator Start = MBB.begin();
@@ -354,7 +385,7 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
- auto Dst = getSaveExec(&MI);
+ auto Dst = MI.getOperand(0).getReg();
// Skip ANDing with exec if the break condition is already masked by exec
// because it is a V_CMP in the same basic block. (We know the break
@@ -416,6 +447,38 @@ void SILowerControlFlow::emitLoop(MachineInstr &MI) {
MI.eraseFromParent();
}
+MachineBasicBlock::iterator
+SILowerControlFlow::skipIgnoreExecInstsTrivialSucc(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const {
+
+ SmallSet<const MachineBasicBlock *, 4> Visited;
+ MachineBasicBlock *B = &MBB;
+ do {
+ if (!Visited.insert(B).second)
+ return MBB.end();
+
+ auto E = B->end();
+ for ( ; It != E; ++It) {
+ if (It->getOpcode() == AMDGPU::SI_KILL_CLEANUP)
+ continue;
+ if (TII->mayReadEXEC(*MRI, *It))
+ break;
+ }
+
+ if (It != E)
+ return It;
+
+ if (B->succ_size() != 1)
+ return MBB.end();
+
+ // If there is one trivial successor, advance to the next block.
+ MachineBasicBlock *Succ = *B->succ_begin();
+
+ It = Succ->begin();
+ B = Succ;
+ } while (true);
+}
+
void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -430,8 +493,20 @@ void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
.addReg(Exec)
.add(MI.getOperand(0));
- if (LIS)
+ LoweredEndCf.insert(NewMI);
+
+ // If this ends control flow which contains kills (as flagged in emitIf)
+ // then insert an SI_KILL_CLEANUP immediately following the exec mask
+ // manipulation. This can be lowered to early termination if appropriate.
+ MachineInstr *CleanUpMI = nullptr;
+ if (NeedsKillCleanup.count(&MI))
+ CleanUpMI = BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::SI_KILL_CLEANUP));
+
+ if (LIS) {
LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
+ if (CleanUpMI)
+ LIS->InsertMachineInstrInMaps(*CleanUpMI);
+ }
MI.eraseFromParent();
@@ -494,6 +569,84 @@ void SILowerControlFlow::combineMasks(MachineInstr &MI) {
MRI->getUniqueVRegDef(Reg)->eraseFromParent();
}
+void SILowerControlFlow::optimizeEndCf() {
+ // If the only instruction immediately following this END_CF is another
+ // END_CF in the only successor, we can avoid emitting the exec mask
+ // restore here.
+ if (!RemoveRedundantEndcf)
+ return;
+
+ for (MachineInstr *MI : LoweredEndCf) {
+ MachineBasicBlock &MBB = *MI->getParent();
+ auto Next =
+ skipIgnoreExecInstsTrivialSucc(MBB, std::next(MI->getIterator()));
+ if (Next == MBB.end() || !LoweredEndCf.count(&*Next))
+ continue;
+ // Only skip the inner END_CF if the outer END_CF belongs to an SI_IF.
+ // If it belongs to an SI_ELSE then the saved mask has an inverted value.
+ Register SavedExec
+ = TII->getNamedOperand(*Next, AMDGPU::OpName::src1)->getReg();
+ assert(SavedExec.isVirtual() && "Expected saved exec to be src1!");
+
+ const MachineInstr *Def = MRI->getUniqueVRegDef(SavedExec);
+ if (Def && LoweredIf.count(SavedExec)) {
+ LLVM_DEBUG(dbgs() << "Skip redundant "; MI->dump());
+ if (LIS)
+ LIS->RemoveMachineInstrFromMaps(*MI);
+ MI->eraseFromParent();
+ }
+ }
+}
+
+void SILowerControlFlow::process(MachineInstr &MI) {
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineBasicBlock::iterator I(MI);
+ MachineInstr *Prev = (I != MBB.begin()) ? &*(std::prev(I)) : nullptr;
+
+ switch (MI.getOpcode()) {
+ case AMDGPU::SI_IF:
+ emitIf(MI);
+ break;
+
+ case AMDGPU::SI_ELSE:
+ emitElse(MI);
+ break;
+
+ case AMDGPU::SI_IF_BREAK:
+ emitIfBreak(MI);
+ break;
+
+ case AMDGPU::SI_LOOP:
+ emitLoop(MI);
+ break;
+
+ case AMDGPU::SI_END_CF:
+ emitEndCf(MI);
+ break;
+
+ default:
+ assert(false && "Attempt to process unsupported instruction");
+ break;
+ }
+
+ MachineBasicBlock::iterator Next;
+ for (I = Prev ? Prev->getIterator() : MBB.begin(); I != MBB.end(); I = Next) {
+ Next = std::next(I);
+ MachineInstr &MaskMI = *I;
+ switch (MaskMI.getOpcode()) {
+ case AMDGPU::S_AND_B64:
+ case AMDGPU::S_OR_B64:
+ case AMDGPU::S_AND_B32:
+ case AMDGPU::S_OR_B32:
+ // Cleanup bit manipulations on exec mask
+ combineMasks(MaskMI);
+ break;
+ default:
+ I = MBB.end();
+ break;
+ }
+ }
+}
+
bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
@@ -503,6 +656,8 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
LIS = getAnalysisIfAvailable<LiveIntervals>();
MRI = &MF.getRegInfo();
BoolRC = TRI->getBoolRC();
+ InsertKillCleanups =
+ MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
if (ST.isWave32()) {
AndOpc = AMDGPU::S_AND_B32;
@@ -524,57 +679,49 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
Exec = AMDGPU::EXEC;
}
+ SmallVector<MachineInstr *, 32> Worklist;
+
MachineFunction::iterator NextBB;
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
BI != BE; BI = NextBB) {
NextBB = std::next(BI);
MachineBasicBlock &MBB = *BI;
- MachineBasicBlock::iterator I, Next, Last;
-
- for (I = MBB.begin(), Last = MBB.end(); I != MBB.end(); I = Next) {
+ MachineBasicBlock::iterator I, Next;
+ for (I = MBB.begin(); I != MBB.end(); I = Next) {
Next = std::next(I);
MachineInstr &MI = *I;
switch (MI.getOpcode()) {
case AMDGPU::SI_IF:
- emitIf(MI);
+ process(MI);
break;
case AMDGPU::SI_ELSE:
- emitElse(MI);
- break;
-
case AMDGPU::SI_IF_BREAK:
- emitIfBreak(MI);
- break;
-
case AMDGPU::SI_LOOP:
- emitLoop(MI);
- break;
-
case AMDGPU::SI_END_CF:
- emitEndCf(MI);
+ // Only build worklist if SI_IF instructions must be processed first.
+ if (InsertKillCleanups)
+ Worklist.push_back(&MI);
+ else
+ process(MI);
break;
- case AMDGPU::S_AND_B64:
- case AMDGPU::S_OR_B64:
- case AMDGPU::S_AND_B32:
- case AMDGPU::S_OR_B32:
- // Cleanup bit manipulations on exec mask
- combineMasks(MI);
- Last = I;
- continue;
-
default:
- Last = I;
- continue;
+ break;
}
-
- // Replay newly inserted code to combine masks
- Next = (Last == MBB.end()) ? MBB.begin() : Last;
}
}
+ for (MachineInstr *MI : Worklist)
+ process(*MI);
+
+ optimizeEndCf();
+
+ LoweredEndCf.clear();
+ LoweredIf.clear();
+ NeedsKillCleanup.clear();
+
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
index 1d45e6241d225..236a24a02ece0 100644
--- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -452,6 +452,11 @@ static unsigned insertUndefLaneMask(MachineBasicBlock &MBB) {
/// all others, because phi lowering looks through copies and can therefore
/// often make copy lowering unnecessary.
bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) {
+ // Only need to run this in SelectionDAG path.
+ if (TheMF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::Selected))
+ return false;
+
MF = &TheMF;
MRI = &MF->getRegInfo();
DT = &getAnalysis<MachineDominatorTree>();
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index 57ccf7641666b..1349d3b6bf3f6 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -100,7 +100,8 @@ static void insertCSRSaves(MachineBasicBlock &SaveBlock,
unsigned Reg = CS.getReg();
MachineInstrSpan MIS(I, &SaveBlock);
- const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+ const TargetRegisterClass *RC =
+ TRI->getMinimalPhysRegClass(Reg, MVT::i32);
TII.storeRegToStackSlot(SaveBlock, I, Reg, true, CS.getFrameIdx(), RC,
TRI);
@@ -118,7 +119,7 @@ static void insertCSRSaves(MachineBasicBlock &SaveBlock,
/// Insert restore code for the callee-saved registers used in the function.
static void insertCSRRestores(MachineBasicBlock &RestoreBlock,
- std::vector<CalleeSavedInfo> &CSI,
+ MutableArrayRef<CalleeSavedInfo> CSI,
LiveIntervals *LIS) {
MachineFunction &MF = *RestoreBlock.getParent();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
@@ -133,7 +134,8 @@ static void insertCSRRestores(MachineBasicBlock &RestoreBlock,
if (!TFI->restoreCalleeSavedRegisters(RestoreBlock, I, CSI, TRI)) {
for (const CalleeSavedInfo &CI : reverse(CSI)) {
unsigned Reg = CI.getReg();
- const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+ const TargetRegisterClass *RC =
+ TRI->getMinimalPhysRegClass(Reg, MVT::i32);
TII.loadRegFromStackSlot(RestoreBlock, I, Reg, CI.getFrameIdx(), RC, TRI);
assert(I != RestoreBlock.begin() &&
@@ -206,10 +208,10 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) {
for (unsigned I = 0; CSRegs[I]; ++I) {
unsigned Reg = CSRegs[I];
if (SavedRegs.test(Reg)) {
- const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+ const TargetRegisterClass *RC =
+ TRI->getMinimalPhysRegClass(Reg, MVT::i32);
int JunkFI = MFI.CreateStackObject(TRI->getSpillSize(*RC),
- TRI->getSpillAlignment(*RC),
- true);
+ TRI->getSpillAlign(*RC), true);
CSI.push_back(CalleeSavedInfo(Reg, JunkFI));
}
@@ -228,6 +230,47 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) {
return false;
}
+// Find the lowest available VGPR and use it as the VGPR reserved for SGPR
+// spills.
+static bool lowerShiftReservedVGPR(MachineFunction &MF,
+ const GCNSubtarget &ST) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+ SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+ Register LowestAvailableVGPR, ReservedVGPR;
+ ArrayRef<MCPhysReg> AllVGPR32s = ST.getRegisterInfo()->getAllVGPR32(MF);
+ for (MCPhysReg Reg : AllVGPR32s) {
+ if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg)) {
+ LowestAvailableVGPR = Reg;
+ break;
+ }
+ }
+
+ if (!LowestAvailableVGPR)
+ return false;
+
+ ReservedVGPR = FuncInfo->VGPRReservedForSGPRSpill;
+ const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
+ int i = 0;
+
+ for (MachineBasicBlock &MBB : MF) {
+ for (auto Reg : FuncInfo->getSGPRSpillVGPRs()) {
+ if (Reg.VGPR == ReservedVGPR) {
+ MBB.removeLiveIn(ReservedVGPR);
+ MBB.addLiveIn(LowestAvailableVGPR);
+ Optional<int> FI;
+ if (FuncInfo->isCalleeSavedReg(CSRegs, LowestAvailableVGPR))
+ FI = FrameInfo.CreateSpillStackObject(4, Align(4));
+
+ FuncInfo->setSGPRSpillVGPRs(LowestAvailableVGPR, FI, i);
+ }
+ ++i;
+ }
+ MBB.sortUniqueLiveIns();
+ }
+
+ return true;
+}
+
bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
@@ -267,6 +310,9 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
//
// This operates under the assumption that only other SGPR spills are users
// of the frame index.
+
+ lowerShiftReservedVGPR(MF, ST);
+
for (MachineBasicBlock &MBB : MF) {
MachineBasicBlock::iterator Next;
for (auto I = MBB.begin(), E = MBB.end(); I != E; I = Next) {
@@ -315,6 +361,8 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
}
MadeChange = true;
+ } else if (FuncInfo->VGPRReservedForSGPRSpill) {
+ FuncInfo->removeVGPRForSGPRSpill(FuncInfo->VGPRReservedForSGPRSpill, MF);
}
SaveBlocks.clear();
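As a rough illustration of lowerShiftReservedVGPR above, the sketch below models the register shuffle with plain structs: scan the VGPRs from lowest to highest, take the first one that is both allocatable and unused, and retarget the spill bookkeeping from the previously reserved (typically high) VGPR to it. VGPRInfo, SpillSlot and shiftReservedVGPR are hypothetical names, not LLVM types; the real code also updates block live-ins and may create a CSR spill slot.

#include <cstdio>
#include <optional>
#include <vector>

struct VGPRInfo { unsigned Reg; bool Allocatable; bool Used; };
struct SpillSlot { unsigned VGPR; std::optional<int> FrameIndex; };

static bool shiftReservedVGPR(const std::vector<VGPRInfo> &AllVGPRs,
                              unsigned ReservedVGPR,
                              std::vector<SpillSlot> &SpillVGPRs) {
  unsigned Lowest = 0;                     // 0 means "none found" here.
  for (const VGPRInfo &V : AllVGPRs)       // Registers ordered low to high.
    if (V.Allocatable && !V.Used) { Lowest = V.Reg; break; }
  if (!Lowest)
    return false;                          // Nothing free: keep the old VGPR.
  for (SpillSlot &S : SpillVGPRs)
    if (S.VGPR == ReservedVGPR)
      S.VGPR = Lowest;                     // Retarget the spill slot.
  return true;
}

int main() {
  std::vector<VGPRInfo> VGPRs = {{1, true, true}, {2, true, false}};
  std::vector<SpillSlot> Spills = {{255, std::nullopt}};
  if (shiftReservedVGPR(VGPRs, 255, Spills))
    std::printf("reserved VGPR moved to v%u\n", Spills[0].VGPR);
}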
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 0c67b1467a5d2..788e9873f780f 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -8,6 +8,7 @@
#include "SIMachineFunctionInfo.h"
#include "AMDGPUArgumentUsageInfo.h"
+#include "AMDGPUTargetMachine.h"
#include "AMDGPUSubtarget.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
@@ -52,9 +53,18 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
WavesPerEU = ST.getWavesPerEU(F);
- Occupancy = ST.computeOccupancy(MF, getLDSSize());
+ Occupancy = ST.computeOccupancy(F, getLDSSize());
CallingConv::ID CC = F.getCallingConv();
+ // FIXME: Should have an analysis or something rather than an attribute to
+ // detect calls.
+ const bool HasCalls = F.hasFnAttribute("amdgpu-calls");
+
+ // Enable all kernel inputs if we have the fixed ABI. Don't bother if we don't
+ // have any calls.
+ const bool UseFixedABI = AMDGPUTargetMachine::EnableFixedFunctionABI &&
+ (!isEntryFunction() || HasCalls);
+
if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) {
if (!F.arg_empty())
KernargSegmentPtr = true;
@@ -68,16 +78,13 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
// Non-entry functions have no special inputs for now, other registers
// required for scratch access.
ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
- ScratchWaveOffsetReg = AMDGPU::SGPR33;
// TODO: Pick a high register, and shift down, similar to a kernel.
- FrameOffsetReg = AMDGPU::SGPR34;
+ FrameOffsetReg = AMDGPU::SGPR33;
StackPtrOffsetReg = AMDGPU::SGPR32;
ArgInfo.PrivateSegmentBuffer =
ArgDescriptor::createRegister(ScratchRSrcReg);
- ArgInfo.PrivateSegmentWaveByteOffset =
- ArgDescriptor::createRegister(ScratchWaveOffsetReg);
if (F.hasFnAttribute("amdgpu-implicitarg-ptr"))
ImplicitArgPtr = true;
@@ -89,27 +96,35 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
}
}
- if (F.hasFnAttribute("amdgpu-work-group-id-x"))
+ if (UseFixedABI) {
WorkGroupIDX = true;
-
- if (F.hasFnAttribute("amdgpu-work-group-id-y"))
WorkGroupIDY = true;
-
- if (F.hasFnAttribute("amdgpu-work-group-id-z"))
WorkGroupIDZ = true;
-
- if (F.hasFnAttribute("amdgpu-work-item-id-x"))
WorkItemIDX = true;
-
- if (F.hasFnAttribute("amdgpu-work-item-id-y"))
WorkItemIDY = true;
-
- if (F.hasFnAttribute("amdgpu-work-item-id-z"))
WorkItemIDZ = true;
+ ImplicitArgPtr = true;
+ } else {
+ if (F.hasFnAttribute("amdgpu-work-group-id-x"))
+ WorkGroupIDX = true;
+
+ if (F.hasFnAttribute("amdgpu-work-group-id-y"))
+ WorkGroupIDY = true;
+
+ if (F.hasFnAttribute("amdgpu-work-group-id-z"))
+ WorkGroupIDZ = true;
+
+ if (F.hasFnAttribute("amdgpu-work-item-id-x"))
+ WorkItemIDX = true;
- const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
- bool HasStackObjects = FrameInfo.hasStackObjects();
+ if (F.hasFnAttribute("amdgpu-work-item-id-y"))
+ WorkItemIDY = true;
+
+ if (F.hasFnAttribute("amdgpu-work-item-id-z"))
+ WorkItemIDZ = true;
+ }
+ bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects");
if (isEntryFunction()) {
// X, XY, and XYZ are the only supported combinations, so make sure Y is
// enabled if Z is.
@@ -129,36 +144,34 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
if (isAmdHsaOrMesa) {
PrivateSegmentBuffer = true;
- if (F.hasFnAttribute("amdgpu-dispatch-ptr"))
+ if (UseFixedABI) {
DispatchPtr = true;
-
- if (F.hasFnAttribute("amdgpu-queue-ptr"))
QueuePtr = true;
- if (F.hasFnAttribute("amdgpu-dispatch-id"))
+ // FIXME: We don't need this?
DispatchID = true;
+ } else {
+ if (F.hasFnAttribute("amdgpu-dispatch-ptr"))
+ DispatchPtr = true;
+
+ if (F.hasFnAttribute("amdgpu-queue-ptr"))
+ QueuePtr = true;
+
+ if (F.hasFnAttribute("amdgpu-dispatch-id"))
+ DispatchID = true;
+ }
} else if (ST.isMesaGfxShader(F)) {
ImplicitBufferPtr = true;
}
- if (F.hasFnAttribute("amdgpu-kernarg-segment-ptr"))
+ if (UseFixedABI || F.hasFnAttribute("amdgpu-kernarg-segment-ptr"))
KernargSegmentPtr = true;
if (ST.hasFlatAddressSpace() && isEntryFunction() && isAmdHsaOrMesa) {
- auto hasNonSpillStackObjects = [&]() {
- // Avoid expensive checking if there's no stack objects.
- if (!HasStackObjects)
- return false;
- for (auto OI = FrameInfo.getObjectIndexBegin(),
- OE = FrameInfo.getObjectIndexEnd(); OI != OE; ++OI)
- if (!FrameInfo.isSpillSlotObjectIndex(OI))
- return true;
- // All stack objects are spill slots.
- return false;
- };
// TODO: This could be refined a lot. The attribute is a poor way of
- // detecting calls that may require it before argument lowering.
- if (hasNonSpillStackObjects() || F.hasFnAttribute("amdgpu-flat-scratch"))
+ // detecting calls or stack objects that may require it before argument
+ // lowering.
+ if (HasCalls || HasStackObjects)
FlatScratchInit = true;
}
@@ -184,7 +197,7 @@ void SIMachineFunctionInfo::limitOccupancy(const MachineFunction &MF) {
MF.getFunction()));
}
-unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
+Register SIMachineFunctionInfo::addPrivateSegmentBuffer(
const SIRegisterInfo &TRI) {
ArgInfo.PrivateSegmentBuffer =
ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
@@ -193,21 +206,21 @@ unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
return ArgInfo.PrivateSegmentBuffer.getRegister();
}
-unsigned SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) {
+Register SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) {
ArgInfo.DispatchPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
NumUserSGPRs += 2;
return ArgInfo.DispatchPtr.getRegister();
}
-unsigned SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) {
+Register SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) {
ArgInfo.QueuePtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
NumUserSGPRs += 2;
return ArgInfo.QueuePtr.getRegister();
}
-unsigned SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) {
+Register SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) {
ArgInfo.KernargSegmentPtr
= ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
@@ -215,28 +228,29 @@ unsigned SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI)
return ArgInfo.KernargSegmentPtr.getRegister();
}
-unsigned SIMachineFunctionInfo::addDispatchID(const SIRegisterInfo &TRI) {
+Register SIMachineFunctionInfo::addDispatchID(const SIRegisterInfo &TRI) {
ArgInfo.DispatchID = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
NumUserSGPRs += 2;
return ArgInfo.DispatchID.getRegister();
}
-unsigned SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) {
+Register SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) {
ArgInfo.FlatScratchInit = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
NumUserSGPRs += 2;
return ArgInfo.FlatScratchInit.getRegister();
}
-unsigned SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI) {
+Register SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI) {
ArgInfo.ImplicitBufferPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
NumUserSGPRs += 2;
return ArgInfo.ImplicitBufferPtr.getRegister();
}
-static bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg) {
+bool SIMachineFunctionInfo::isCalleeSavedReg(const MCPhysReg *CSRegs,
+ MCPhysReg Reg) {
for (unsigned I = 0; CSRegs[I]; ++I) {
if (CSRegs[I] == Reg)
return true;
@@ -270,22 +284,35 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
MachineFrameInfo &FrameInfo = MF.getFrameInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
unsigned WaveSize = ST.getWavefrontSize();
+ SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
unsigned Size = FrameInfo.getObjectSize(FI);
- assert(Size >= 4 && Size <= 64 && "invalid sgpr spill size");
- assert(TRI->spillSGPRToVGPR() && "not spilling SGPRs to VGPRs");
+ unsigned NumLanes = Size / 4;
+
+ if (NumLanes > WaveSize)
+ return false;
- int NumLanes = Size / 4;
+ assert(Size >= 4 && "invalid sgpr spill size");
+ assert(TRI->spillSGPRToVGPR() && "not spilling SGPRs to VGPRs");
const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
// Make sure to handle the case where a wide SGPR spill may span between two
// VGPRs.
- for (int I = 0; I < NumLanes; ++I, ++NumVGPRSpillLanes) {
- unsigned LaneVGPR;
+ for (unsigned I = 0; I < NumLanes; ++I, ++NumVGPRSpillLanes) {
+ Register LaneVGPR;
unsigned VGPRIndex = (NumVGPRSpillLanes % WaveSize);
- if (VGPRIndex == 0) {
+ // Reserve a VGPR (when NumVGPRSpillLanes = 0, WaveSize, 2*WaveSize, ..)
+ // when one of the two conditions is true:
+ // 1. The reserved VGPR being tracked by VGPRReservedForSGPRSpill has not
+ // yet been used for spilling.
+ // 2. All spill lanes of the reserved VGPR(s) are full and another spill
+ // lane is required.
+ if (FuncInfo->VGPRReservedForSGPRSpill && NumVGPRSpillLanes < WaveSize) {
+ assert(FuncInfo->VGPRReservedForSGPRSpill == SpillVGPRs.back().VGPR);
+ LaneVGPR = FuncInfo->VGPRReservedForSGPRSpill;
+ } else if (VGPRIndex == 0) {
LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
if (LaneVGPR == AMDGPU::NoRegister) {
// We have no VGPRs left for spilling SGPRs. Reset because we will not
@@ -298,7 +325,7 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
Optional<int> CSRSpillFI;
if ((FrameInfo.hasCalls() || !isEntryFunction()) && CSRegs &&
isCalleeSavedReg(CSRegs, LaneVGPR)) {
- CSRSpillFI = FrameInfo.CreateSpillStackObject(4, 4);
+ CSRSpillFI = FrameInfo.CreateSpillStackObject(4, Align(4));
}
SpillVGPRs.push_back(SGPRSpillVGPRCSR(LaneVGPR, CSRSpillFI));
@@ -317,6 +344,19 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
return true;
}
+/// Reserve a VGPR for spilling of SGPRs
+bool SIMachineFunctionInfo::reserveVGPRforSGPRSpills(MachineFunction &MF) {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+
+ Register LaneVGPR = TRI->findUnusedRegister(
+ MF.getRegInfo(), &AMDGPU::VGPR_32RegClass, MF, true);
+ SpillVGPRs.push_back(SGPRSpillVGPRCSR(LaneVGPR, None));
+ FuncInfo->VGPRReservedForSGPRSpill = LaneVGPR;
+ return true;
+}
+
/// Reserve AGPRs or VGPRs to support spilling for FrameIndex \p FI.
/// Either an AGPR is spilled to a VGPR or vice versa.
/// Returns true if a \p FI can be eliminated completely.
@@ -386,9 +426,9 @@ bool SIMachineFunctionInfo::allocateVGPRSpillToAGPR(MachineFunction &MF,
}
void SIMachineFunctionInfo::removeDeadFrameIndices(MachineFrameInfo &MFI) {
- // The FP spill hasn't been inserted yet, so keep it around.
+ // The FP & BP spills haven't been inserted yet, so keep them around.
for (auto &R : SGPRToVGPRSpills) {
- if (R.first != FramePointerSaveIndex)
+ if (R.first != FramePointerSaveIndex && R.first != BasePointerSaveIndex)
MFI.RemoveStackObject(R.first);
}
@@ -396,7 +436,7 @@ void SIMachineFunctionInfo::removeDeadFrameIndices(MachineFrameInfo &MFI) {
// ID.
for (int i = MFI.getObjectIndexBegin(), e = MFI.getObjectIndexEnd(); i != e;
++i)
- if (i != FramePointerSaveIndex)
+ if (i != FramePointerSaveIndex && i != BasePointerSaveIndex)
MFI.setStackID(i, TargetStackID::Default);
for (auto &R : VGPRToAGPRSpills) {
@@ -414,7 +454,28 @@ MCPhysReg SIMachineFunctionInfo::getNextSystemSGPR() const {
return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs;
}
-static yaml::StringValue regToString(unsigned Reg,
+Register
+SIMachineFunctionInfo::getGITPtrLoReg(const MachineFunction &MF) const {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ if (!ST.isAmdPalOS())
+ return Register();
+ Register GitPtrLo = AMDGPU::SGPR0; // Low GIT address passed in
+ if (ST.hasMergedShaders()) {
+ switch (MF.getFunction().getCallingConv()) {
+ case CallingConv::AMDGPU_HS:
+ case CallingConv::AMDGPU_GS:
+ // Low GIT address is passed in s8 rather than s0 for an LS+HS or
+ // ES+GS merged shader on gfx9+.
+ GitPtrLo = AMDGPU::SGPR8;
+ return GitPtrLo;
+ default:
+ return GitPtrLo;
+ }
+ }
+ return GitPtrLo;
+}
+
+static yaml::StringValue regToString(Register Reg,
const TargetRegisterInfo &TRI) {
yaml::StringValue Dest;
{
@@ -487,7 +548,6 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
WaveLimiter(MFI.needsWaveLimiter()),
HighBitsOf32BitAddress(MFI.get32BitAddressHighBits()),
ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)),
- ScratchWaveOffsetReg(regToString(MFI.getScratchWaveOffsetReg(), TRI)),
FrameOffsetReg(regToString(MFI.getFrameOffsetReg(), TRI)),
StackPtrOffsetReg(regToString(MFI.getStackPtrOffsetReg(), TRI)),
ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)),
@@ -509,3 +569,21 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields(
WaveLimiter = YamlMFI.WaveLimiter;
return false;
}
+
+// Remove VGPR which was reserved for SGPR spills if there are no spilled SGPRs
+bool SIMachineFunctionInfo::removeVGPRForSGPRSpill(Register ReservedVGPR,
+ MachineFunction &MF) {
+ for (auto *i = SpillVGPRs.begin(); i < SpillVGPRs.end(); i++) {
+ if (i->VGPR == ReservedVGPR) {
+ SpillVGPRs.erase(i);
+
+ for (MachineBasicBlock &MBB : MF) {
+ MBB.removeLiveIn(ReservedVGPR);
+ MBB.sortUniqueLiveIns();
+ }
+ this->VGPRReservedForSGPRSpill = AMDGPU::NoRegister;
+ return true;
+ }
+ }
+ return false;
+}
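The allocateSGPRSpillToVGPR hunk above packs an SGPR spill into VGPR lanes; a self-contained sketch of just that indexing follows. It assumes, as in the patch, that each 32-bit SGPR takes one lane, that WaveSize lanes fit in a VGPR, and that spills wider than WaveSize lanes are rejected; the Lane struct and assignLanes helper are illustrative only.

#include <cstdio>
#include <vector>

struct Lane { unsigned VGPRIndexInPool; unsigned LaneIndex; };

static std::vector<Lane> assignLanes(unsigned SizeBytes, unsigned WaveSize,
                                     unsigned &NumVGPRSpillLanes) {
  std::vector<Lane> Result;
  unsigned NumLanes = SizeBytes / 4;          // One 32-bit SGPR per lane.
  if (NumLanes > WaveSize)
    return Result;                            // Too wide to spill this way.
  for (unsigned I = 0; I < NumLanes; ++I, ++NumVGPRSpillLanes)
    Result.push_back({NumVGPRSpillLanes / WaveSize,   // Which spill VGPR.
                      NumVGPRSpillLanes % WaveSize}); // Which lane within it.
  return Result;
}

int main() {
  unsigned NumVGPRSpillLanes = 62;            // Two lanes left in VGPR #0.
  for (Lane L : assignLanes(/*SizeBytes=*/16, /*WaveSize=*/64, NumVGPRSpillLanes))
    std::printf("VGPR %u lane %u\n", L.VGPRIndexInPool, L.LaneIndex);
}

Running the example with the lane counter already at 62 shows a 16-byte spill spanning the last two lanes of one VGPR and the first two of the next, which is the "wide SGPR spill may span between two VGPRs" case mentioned in the comment above.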
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index ef0186f7d57fe..cf1629fda0aff 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -236,23 +236,29 @@ template <> struct MappingTraits<SIArgumentInfo> {
struct SIMode {
bool IEEE = true;
bool DX10Clamp = true;
- bool FP32Denormals = true;
- bool FP64FP16Denormals = true;
+ bool FP32InputDenormals = true;
+ bool FP32OutputDenormals = true;
+ bool FP64FP16InputDenormals = true;
+ bool FP64FP16OutputDenormals = true;
SIMode() = default;
SIMode(const AMDGPU::SIModeRegisterDefaults &Mode) {
IEEE = Mode.IEEE;
DX10Clamp = Mode.DX10Clamp;
- FP32Denormals = Mode.FP32Denormals;
- FP64FP16Denormals = Mode.FP64FP16Denormals;
+ FP32InputDenormals = Mode.FP32InputDenormals;
+ FP32OutputDenormals = Mode.FP32OutputDenormals;
+ FP64FP16InputDenormals = Mode.FP64FP16InputDenormals;
+ FP64FP16OutputDenormals = Mode.FP64FP16OutputDenormals;
}
bool operator ==(const SIMode Other) const {
return IEEE == Other.IEEE &&
DX10Clamp == Other.DX10Clamp &&
- FP32Denormals == Other.FP32Denormals &&
- FP64FP16Denormals == Other.FP64FP16Denormals;
+ FP32InputDenormals == Other.FP32InputDenormals &&
+ FP32OutputDenormals == Other.FP32OutputDenormals &&
+ FP64FP16InputDenormals == Other.FP64FP16InputDenormals &&
+ FP64FP16OutputDenormals == Other.FP64FP16OutputDenormals;
}
};
@@ -260,8 +266,10 @@ template <> struct MappingTraits<SIMode> {
static void mapping(IO &YamlIO, SIMode &Mode) {
YamlIO.mapOptional("ieee", Mode.IEEE, true);
YamlIO.mapOptional("dx10-clamp", Mode.DX10Clamp, true);
- YamlIO.mapOptional("fp32-denormals", Mode.FP32Denormals, true);
- YamlIO.mapOptional("fp64-fp16-denormals", Mode.FP64FP16Denormals, true);
+ YamlIO.mapOptional("fp32-input-denormals", Mode.FP32InputDenormals, true);
+ YamlIO.mapOptional("fp32-output-denormals", Mode.FP32OutputDenormals, true);
+ YamlIO.mapOptional("fp64-fp16-input-denormals", Mode.FP64FP16InputDenormals, true);
+ YamlIO.mapOptional("fp64-fp16-output-denormals", Mode.FP64FP16OutputDenormals, true);
}
};
@@ -276,7 +284,6 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
uint32_t HighBitsOf32BitAddress = 0;
StringValue ScratchRSrcReg = "$private_rsrc_reg";
- StringValue ScratchWaveOffsetReg = "$scratch_wave_offset_reg";
StringValue FrameOffsetReg = "$fp_reg";
StringValue StackPtrOffsetReg = "$sp_reg";
@@ -303,8 +310,6 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
YamlIO.mapOptional("waveLimiter", MFI.WaveLimiter, false);
YamlIO.mapOptional("scratchRSrcReg", MFI.ScratchRSrcReg,
StringValue("$private_rsrc_reg"));
- YamlIO.mapOptional("scratchWaveOffsetReg", MFI.ScratchWaveOffsetReg,
- StringValue("$scratch_wave_offset_reg"));
YamlIO.mapOptional("frameOffsetReg", MFI.FrameOffsetReg,
StringValue("$fp_reg"));
YamlIO.mapOptional("stackPtrOffsetReg", MFI.StackPtrOffsetReg,
@@ -323,20 +328,20 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
friend class GCNTargetMachine;
- unsigned TIDReg = AMDGPU::NoRegister;
+ Register TIDReg = AMDGPU::NoRegister;
// Registers that may be reserved for spilling purposes. These may be the same
// as the input registers.
- unsigned ScratchRSrcReg = AMDGPU::PRIVATE_RSRC_REG;
- unsigned ScratchWaveOffsetReg = AMDGPU::SCRATCH_WAVE_OFFSET_REG;
+ Register ScratchRSrcReg = AMDGPU::PRIVATE_RSRC_REG;
- // This is the current function's incremented size from the kernel's scratch
- // wave offset register. For an entry function, this is exactly the same as
- // the ScratchWaveOffsetReg.
- unsigned FrameOffsetReg = AMDGPU::FP_REG;
+ // This is the unswizzled offset from the current dispatch's scratch wave
+ // base to the beginning of the current function's frame.
+ Register FrameOffsetReg = AMDGPU::FP_REG;
- // Top of the stack SGPR offset derived from the ScratchWaveOffsetReg.
- unsigned StackPtrOffsetReg = AMDGPU::SP_REG;
+ // This is an ABI register used in the non-entry calling convention to
+ // communicate the unswizzled offset from the current dispatch's scratch wave
+ // base to the beginning of the new function's frame.
+ Register StackPtrOffsetReg = AMDGPU::SP_REG;
AMDGPUFunctionArgInfo ArgInfo;
@@ -429,11 +434,11 @@ private:
public:
struct SpilledReg {
- unsigned VGPR = 0;
+ Register VGPR;
int Lane = -1;
SpilledReg() = default;
- SpilledReg(unsigned R, int L) : VGPR (R), Lane (L) {}
+ SpilledReg(Register R, int L) : VGPR (R), Lane (L) {}
bool hasLane() { return Lane != -1;}
bool hasReg() { return VGPR != 0;}
@@ -441,13 +446,13 @@ public:
struct SGPRSpillVGPRCSR {
// VGPR used for SGPR spills
- unsigned VGPR;
+ Register VGPR;
// If the VGPR is a CSR, the stack slot used to save/restore it in the
// prolog/epilog.
Optional<int> FI;
- SGPRSpillVGPRCSR(unsigned V, Optional<int> F) : VGPR(V), FI(F) {}
+ SGPRSpillVGPRCSR(Register V, Optional<int> F) : VGPR(V), FI(F) {}
};
struct VGPRSpillToAGPR {
@@ -457,12 +462,9 @@ public:
SparseBitVector<> WWMReservedRegs;
- void ReserveWWMRegister(unsigned reg) { WWMReservedRegs.set(reg); }
+ void ReserveWWMRegister(Register Reg) { WWMReservedRegs.set(Reg); }
private:
- // SGPR->VGPR spilling support.
- using SpillRegMask = std::pair<unsigned, unsigned>;
-
// Track VGPR + wave index for each subregister of the SGPR spilled to
// frameindex key.
DenseMap<int, std::vector<SpilledReg>> SGPRToVGPRSpills;
@@ -480,9 +482,17 @@ private:
public: // FIXME
/// If this is set, an SGPR used for save/restore of the register used for the
/// frame pointer.
- unsigned SGPRForFPSaveRestoreCopy = 0;
+ Register SGPRForFPSaveRestoreCopy;
Optional<int> FramePointerSaveIndex;
+ /// If this is set, an SGPR used for save/restore of the register used for the
+ /// base pointer.
+ Register SGPRForBPSaveRestoreCopy;
+ Optional<int> BasePointerSaveIndex;
+
+ Register VGPRReservedForSGPRSpill;
+ bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg);
+
public:
SIMachineFunctionInfo(const MachineFunction &MF);
@@ -498,6 +508,14 @@ public:
return SpillVGPRs;
}
+ void setSGPRSpillVGPRs(Register NewVGPR, Optional<int> newFI, int Index) {
+ SpillVGPRs[Index].VGPR = NewVGPR;
+ SpillVGPRs[Index].FI = newFI;
+ VGPRReservedForSGPRSpill = NewVGPR;
+ }
+
+ bool removeVGPRForSGPRSpill(Register ReservedVGPR, MachineFunction &MF);
+
ArrayRef<MCPhysReg> getAGPRSpillVGPRs() const {
return SpillAGPR;
}
@@ -515,12 +533,13 @@ public:
bool haveFreeLanesForSGPRSpill(const MachineFunction &MF,
unsigned NumLane) const;
bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI);
+ bool reserveVGPRforSGPRSpills(MachineFunction &MF);
bool allocateVGPRSpillToAGPR(MachineFunction &MF, int FI, bool isAGPRtoVGPR);
void removeDeadFrameIndices(MachineFrameInfo &MFI);
bool hasCalculatedTID() const { return TIDReg != 0; };
- unsigned getTIDReg() const { return TIDReg; };
- void setTIDReg(unsigned Reg) { TIDReg = Reg; }
+ Register getTIDReg() const { return TIDReg; };
+ void setTIDReg(Register Reg) { TIDReg = Reg; }
unsigned getBytesInStackArgArea() const {
return BytesInStackArgArea;
@@ -531,34 +550,34 @@ public:
}
// Add user SGPRs.
- unsigned addPrivateSegmentBuffer(const SIRegisterInfo &TRI);
- unsigned addDispatchPtr(const SIRegisterInfo &TRI);
- unsigned addQueuePtr(const SIRegisterInfo &TRI);
- unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI);
- unsigned addDispatchID(const SIRegisterInfo &TRI);
- unsigned addFlatScratchInit(const SIRegisterInfo &TRI);
- unsigned addImplicitBufferPtr(const SIRegisterInfo &TRI);
+ Register addPrivateSegmentBuffer(const SIRegisterInfo &TRI);
+ Register addDispatchPtr(const SIRegisterInfo &TRI);
+ Register addQueuePtr(const SIRegisterInfo &TRI);
+ Register addKernargSegmentPtr(const SIRegisterInfo &TRI);
+ Register addDispatchID(const SIRegisterInfo &TRI);
+ Register addFlatScratchInit(const SIRegisterInfo &TRI);
+ Register addImplicitBufferPtr(const SIRegisterInfo &TRI);
// Add system SGPRs.
- unsigned addWorkGroupIDX() {
+ Register addWorkGroupIDX() {
ArgInfo.WorkGroupIDX = ArgDescriptor::createRegister(getNextSystemSGPR());
NumSystemSGPRs += 1;
return ArgInfo.WorkGroupIDX.getRegister();
}
- unsigned addWorkGroupIDY() {
+ Register addWorkGroupIDY() {
ArgInfo.WorkGroupIDY = ArgDescriptor::createRegister(getNextSystemSGPR());
NumSystemSGPRs += 1;
return ArgInfo.WorkGroupIDY.getRegister();
}
- unsigned addWorkGroupIDZ() {
+ Register addWorkGroupIDZ() {
ArgInfo.WorkGroupIDZ = ArgDescriptor::createRegister(getNextSystemSGPR());
NumSystemSGPRs += 1;
return ArgInfo.WorkGroupIDZ.getRegister();
}
- unsigned addWorkGroupInfo() {
+ Register addWorkGroupInfo() {
ArgInfo.WorkGroupInfo = ArgDescriptor::createRegister(getNextSystemSGPR());
NumSystemSGPRs += 1;
return ArgInfo.WorkGroupInfo.getRegister();
@@ -577,14 +596,14 @@ public:
ArgInfo.WorkItemIDZ = Arg;
}
- unsigned addPrivateSegmentWaveByteOffset() {
+ Register addPrivateSegmentWaveByteOffset() {
ArgInfo.PrivateSegmentWaveByteOffset
= ArgDescriptor::createRegister(getNextSystemSGPR());
NumSystemSGPRs += 1;
return ArgInfo.PrivateSegmentWaveByteOffset.getRegister();
}
- void setPrivateSegmentWaveByteOffset(unsigned Reg) {
+ void setPrivateSegmentWaveByteOffset(Register Reg) {
ArgInfo.PrivateSegmentWaveByteOffset = ArgDescriptor::createRegister(Reg);
}
@@ -660,13 +679,13 @@ public:
return ArgInfo;
}
- std::pair<const ArgDescriptor *, const TargetRegisterClass *>
+ std::tuple<const ArgDescriptor *, const TargetRegisterClass *, LLT>
getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const {
return ArgInfo.getPreloadedValue(Value);
}
Register getPreloadedReg(AMDGPUFunctionArgInfo::PreloadedValue Value) const {
- auto Arg = ArgInfo.getPreloadedValue(Value).first;
+ auto Arg = std::get<0>(ArgInfo.getPreloadedValue(Value));
return Arg ? Arg->getRegister() : Register();
}
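Since getPreloadedValue now returns a three-element std::tuple instead of a std::pair, callers switch from .first to std::get<0>, as getPreloadedReg does above. Below is a tiny, self-contained illustration of that access pattern; the types are placeholders standing in for ArgDescriptor, TargetRegisterClass and LLT, not the LLVM definitions.

#include <cstdio>
#include <tuple>

struct ArgDescriptor { int Reg; };
struct TargetRegisterClass {};
struct LLT { unsigned SizeInBits; };

// Stand-in for the new three-element return type.
static std::tuple<const ArgDescriptor *, const TargetRegisterClass *, LLT>
getPreloadedValue() {
  static ArgDescriptor Desc{42};
  static TargetRegisterClass RC;
  return {&Desc, &RC, LLT{64}};
}

int main() {
  // Callers interested only in the descriptor use std::get<0> instead of .first.
  const ArgDescriptor *Arg = std::get<0>(getPreloadedValue());
  std::printf("reg = %d\n", Arg ? Arg->Reg : 0);
}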
@@ -674,6 +693,8 @@ public:
return GITPtrHigh;
}
+ Register getGITPtrLoReg(const MachineFunction &MF) const;
+
uint32_t get32BitAddressHighBits() const {
return HighBitsOf32BitAddress;
}
@@ -690,35 +711,31 @@ public:
return NumUserSGPRs + NumSystemSGPRs;
}
- unsigned getPrivateSegmentWaveByteOffsetSystemSGPR() const {
+ Register getPrivateSegmentWaveByteOffsetSystemSGPR() const {
return ArgInfo.PrivateSegmentWaveByteOffset.getRegister();
}
/// Returns the physical register reserved for use as the resource
/// descriptor for scratch accesses.
- unsigned getScratchRSrcReg() const {
+ Register getScratchRSrcReg() const {
return ScratchRSrcReg;
}
- void setScratchRSrcReg(unsigned Reg) {
+ void setScratchRSrcReg(Register Reg) {
assert(Reg != 0 && "Should never be unset");
ScratchRSrcReg = Reg;
}
- unsigned getScratchWaveOffsetReg() const {
- return ScratchWaveOffsetReg;
- }
-
- unsigned getFrameOffsetReg() const {
+ Register getFrameOffsetReg() const {
return FrameOffsetReg;
}
- void setFrameOffsetReg(unsigned Reg) {
+ void setFrameOffsetReg(Register Reg) {
assert(Reg != 0 && "Should never be unset");
FrameOffsetReg = Reg;
}
- void setStackPtrOffsetReg(unsigned Reg) {
+ void setStackPtrOffsetReg(Register Reg) {
assert(Reg != 0 && "Should never be unset");
StackPtrOffsetReg = Reg;
}
@@ -727,20 +744,15 @@ public:
// NoRegister. This is mostly a workaround for MIR tests where state that
// can't be directly computed from the function is not preserved in serialized
// MIR.
- unsigned getStackPtrOffsetReg() const {
+ Register getStackPtrOffsetReg() const {
return StackPtrOffsetReg;
}
- void setScratchWaveOffsetReg(unsigned Reg) {
- assert(Reg != 0 && "Should never be unset");
- ScratchWaveOffsetReg = Reg;
- }
-
- unsigned getQueuePtrUserSGPR() const {
+ Register getQueuePtrUserSGPR() const {
return ArgInfo.QueuePtr.getRegister();
}
- unsigned getImplicitBufferPtrUserSGPR() const {
+ Register getImplicitBufferPtrUserSGPR() const {
return ArgInfo.ImplicitBufferPtr.getRegister();
}
@@ -853,7 +865,7 @@ public:
}
/// \returns SGPR used for \p Dim's work group ID.
- unsigned getWorkGroupIDSGPR(unsigned Dim) const {
+ Register getWorkGroupIDSGPR(unsigned Dim) const {
switch (Dim) {
case 0:
assert(hasWorkGroupIDX());
diff --git a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
index 004a3cb185d62..3ba05aadbbbee 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
@@ -269,8 +269,8 @@ SUnit* SIScheduleBlock::pickNode() {
// Predict register usage after this instruction.
TryCand.SU = SU;
TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure);
- TryCand.SGPRUsage = pressure[DAG->getSGPRSetID()];
- TryCand.VGPRUsage = pressure[DAG->getVGPRSetID()];
+ TryCand.SGPRUsage = pressure[AMDGPU::RegisterPressureSets::SReg_32];
+ TryCand.VGPRUsage = pressure[AMDGPU::RegisterPressureSets::VGPR_32];
TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum];
TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum];
TryCand.HasLowLatencyNonWaitedParent =
@@ -595,10 +595,12 @@ void SIScheduleBlock::printDebug(bool full) {
}
if (Scheduled) {
- dbgs() << "LiveInPressure " << LiveInPressure[DAG->getSGPRSetID()] << ' '
- << LiveInPressure[DAG->getVGPRSetID()] << '\n';
- dbgs() << "LiveOutPressure " << LiveOutPressure[DAG->getSGPRSetID()] << ' '
- << LiveOutPressure[DAG->getVGPRSetID()] << "\n\n";
+ dbgs() << "LiveInPressure "
+ << LiveInPressure[AMDGPU::RegisterPressureSets::SReg_32] << ' '
+ << LiveInPressure[AMDGPU::RegisterPressureSets::VGPR_32] << '\n';
+ dbgs() << "LiveOutPressure "
+ << LiveOutPressure[AMDGPU::RegisterPressureSets::SReg_32] << ' '
+ << LiveOutPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n\n";
dbgs() << "LiveIns:\n";
for (unsigned Reg : LiveInRegs)
dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
@@ -1637,7 +1639,7 @@ SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock();
TryCand.VGPRUsageDiff =
checkRegUsageImpact(TryCand.Block->getInRegs(),
- TryCand.Block->getOutRegs())[DAG->getVGPRSetID()];
+ TryCand.Block->getOutRegs())[AMDGPU::RegisterPressureSets::VGPR_32];
TryCand.NumSuccessors = TryCand.Block->getSuccs().size();
TryCand.NumHighLatencySuccessors =
TryCand.Block->getNumHighLatencySuccessors();
@@ -1796,9 +1798,6 @@ SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) :
ScheduleDAGMILive(C, std::make_unique<GenericScheduler>(C)) {
SITII = static_cast<const SIInstrInfo*>(TII);
SITRI = static_cast<const SIRegisterInfo*>(TRI);
-
- VGPRSetID = SITRI->getVGPRPressureSet();
- SGPRSetID = SITRI->getSGPRPressureSet();
}
SIScheduleDAGMI::~SIScheduleDAGMI() = default;
@@ -1909,9 +1908,9 @@ SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End,
continue;
PSetIterator PSetI = MRI.getPressureSets(Reg);
for (; PSetI.isValid(); ++PSetI) {
- if (*PSetI == VGPRSetID)
+ if (*PSetI == AMDGPU::RegisterPressureSets::VGPR_32)
VgprUsage += PSetI.getWeight();
- else if (*PSetI == SGPRSetID)
+ else if (*PSetI == AMDGPU::RegisterPressureSets::SReg_32)
SgprUsage += PSetI.getWeight();
}
}
@@ -1952,10 +1951,11 @@ void SIScheduleDAGMI::schedule()
int64_t OffLatReg;
if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
IsLowLatencySU[i] = 1;
+ bool OffsetIsScalable;
if (SITII->getMemOperandWithOffset(*SU->getInstr(), BaseLatOp, OffLatReg,
- TRI))
+ OffsetIsScalable, TRI))
LowLatencyOffset[i] = OffLatReg;
- } else if (SITII->isHighLatencyInstruction(*SU->getInstr()))
+ } else if (SITII->isHighLatencyDef(SU->getInstr()->getOpcode()))
IsHighLatencySU[i] = 1;
}
diff --git a/llvm/lib/Target/AMDGPU/SIMachineScheduler.h b/llvm/lib/Target/AMDGPU/SIMachineScheduler.h
index ec450a3164674..02e0a3fe1b610 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineScheduler.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineScheduler.h
@@ -435,9 +435,6 @@ class SIScheduleDAGMI final : public ScheduleDAGMILive {
std::vector<unsigned> ScheduledSUnits;
std::vector<unsigned> ScheduledSUnitsInv;
- unsigned VGPRSetID;
- unsigned SGPRSetID;
-
public:
SIScheduleDAGMI(MachineSchedContext *C);
@@ -484,9 +481,6 @@ public:
return OutRegs;
};
- unsigned getVGPRSetID() const { return VGPRSetID; }
- unsigned getSGPRSetID() const { return SGPRSetID; }
-
private:
void topologicalSort();
// After scheduling is done, improve low latency placements.
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index e914573306ae0..4e6c72ca20e28 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -254,6 +254,9 @@ protected:
IsaVersion IV;
+ /// Whether to insert cache invalidation instructions.
+ bool InsertCacheInv;
+
SICacheControl(const GCNSubtarget &ST);
public:
@@ -650,6 +653,7 @@ Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
SICacheControl::SICacheControl(const GCNSubtarget &ST) {
TII = ST.getInstrInfo();
IV = getIsaVersion(ST.getCPU());
+ InsertCacheInv = !ST.isAmdPalOS();
}
/* static */
@@ -714,6 +718,9 @@ bool SIGfx6CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
Position Pos) const {
+ if (!InsertCacheInv)
+ return false;
+
bool Changed = false;
MachineBasicBlock &MBB = *MI->getParent();
@@ -852,6 +859,9 @@ bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
Position Pos) const {
+ if (!InsertCacheInv)
+ return false;
+
bool Changed = false;
MachineBasicBlock &MBB = *MI->getParent();
@@ -954,6 +964,9 @@ bool SIGfx10CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
Position Pos) const {
+ if (!InsertCacheInv)
+ return false;
+
bool Changed = false;
MachineBasicBlock &MBB = *MI->getParent();
@@ -1289,6 +1302,21 @@ bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
for (auto &MBB : MF) {
for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
+
+ if (MI->getOpcode() == TargetOpcode::BUNDLE && MI->mayLoadOrStore()) {
+ MachineBasicBlock::instr_iterator II(MI->getIterator());
+ for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
+ I != E && I->isBundledWithPred(); ++I) {
+ I->unbundleFromPred();
+ for (MachineOperand &MO : I->operands())
+ if (MO.isReg())
+ MO.setIsInternalRead(false);
+ }
+
+ MI->eraseFromParent();
+ MI = II->getIterator();
+ }
+
if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
continue;
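A condensed model of the un-bundling step added to SIMemoryLegalizer above, using a plain std::list in place of a MachineBasicBlock: detach every instruction bundled with its predecessor, clear the internal-read marking, and erase the BUNDLE header so each memory operation is legalized individually. Instr and unbundle are illustrative names, not LLVM types.

#include <cstdio>
#include <list>
#include <string>

struct Instr {
  std::string Name;
  bool IsBundleHeader = false;
  bool BundledWithPred = false;
  bool InternalRead = false;
};

static void unbundle(std::list<Instr> &Block, std::list<Instr>::iterator MI) {
  auto I = std::next(MI);
  for (; I != Block.end() && I->BundledWithPred; ++I) {
    I->BundledWithPred = false;   // Detach from the bundle.
    I->InternalRead = false;      // Operands are now externally visible.
  }
  Block.erase(MI);                // Remove the BUNDLE header itself.
}

int main() {
  std::list<Instr> Block = {{"BUNDLE", true},
                            {"load", false, true, true},
                            {"store", false, true, true}};
  unbundle(Block, Block.begin());
  for (const Instr &I : Block)
    std::printf("%s bundled=%d\n", I.Name.c_str(), int(I.BundledWithPred));
}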
diff --git a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
index 52989a280e806..0e162ac42c111 100644
--- a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
+++ b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
@@ -83,9 +83,7 @@ struct Status {
return ((Mask & S.Mask) == S.Mask) && ((Mode & S.Mask) == S.Mode);
}
- bool isCombinable(Status &S) {
- return !(Mask & S.Mask) || isCompatible(S);
- }
+ bool isCombinable(Status &S) { return !(Mask & S.Mask) || isCompatible(S); }
};
class BlockData {
@@ -110,7 +108,11 @@ public:
// which is used in Phase 3 if we need to insert a mode change.
MachineInstr *FirstInsertionPoint;
- BlockData() : FirstInsertionPoint(nullptr) {};
+ // A flag to indicate whether an Exit value has been set (we can't tell by
+ // examining the Exit value itself as all values may be valid results).
+ bool ExitSet;
+
+ BlockData() : FirstInsertionPoint(nullptr), ExitSet(false) {}
};
namespace {
@@ -131,6 +133,8 @@ public:
Status DefaultStatus =
Status(FP_ROUND_MODE_DP(0x3), FP_ROUND_MODE_DP(DefaultMode));
+ bool Changed = false;
+
public:
SIModeRegister() : MachineFunctionPass(ID) {}
@@ -201,6 +205,7 @@ void SIModeRegister::insertSetreg(MachineBasicBlock &MBB, MachineInstr *MI,
(Offset << AMDGPU::Hwreg::OFFSET_SHIFT_) |
(AMDGPU::Hwreg::ID_MODE << AMDGPU::Hwreg::ID_SHIFT_));
++NumSetregInserted;
+ Changed = true;
InstrMode.Mask &= ~(((1 << Width) - 1) << Offset);
}
}
@@ -325,24 +330,53 @@ void SIModeRegister::processBlockPhase1(MachineBasicBlock &MBB,
// exit value is propagated.
void SIModeRegister::processBlockPhase2(MachineBasicBlock &MBB,
const SIInstrInfo *TII) {
-// BlockData *BI = BlockInfo[MBB.getNumber()];
+ bool RevisitRequired = false;
+ bool ExitSet = false;
unsigned ThisBlock = MBB.getNumber();
if (MBB.pred_empty()) {
// There are no predecessors, so use the default starting status.
BlockInfo[ThisBlock]->Pred = DefaultStatus;
+ ExitSet = true;
} else {
// Build a status that is common to all the predecessors by intersecting
// all the predecessor exit status values.
+ // Mask bits (which represent the Mode bits with a known value) can only be
+ // added by explicit SETREG instructions or the initial default value -
+ // the intersection process may remove Mask bits.
+ // If we find a predecessor that has not yet had an exit value determined
+ // (this can happen for example if a block is its own predecessor) we defer
+ // use of that value as the Mask will be all zero, and we will revisit this
+ // block again later (unless the only predecessor without an exit value is
+ // this block).
MachineBasicBlock::pred_iterator P = MBB.pred_begin(), E = MBB.pred_end();
MachineBasicBlock &PB = *(*P);
- BlockInfo[ThisBlock]->Pred = BlockInfo[PB.getNumber()]->Exit;
+ unsigned PredBlock = PB.getNumber();
+ if ((ThisBlock == PredBlock) && (std::next(P) == E)) {
+ BlockInfo[ThisBlock]->Pred = DefaultStatus;
+ ExitSet = true;
+ } else if (BlockInfo[PredBlock]->ExitSet) {
+ BlockInfo[ThisBlock]->Pred = BlockInfo[PredBlock]->Exit;
+ ExitSet = true;
+ } else if (PredBlock != ThisBlock)
+ RevisitRequired = true;
for (P = std::next(P); P != E; P = std::next(P)) {
MachineBasicBlock *Pred = *P;
- BlockInfo[ThisBlock]->Pred = BlockInfo[ThisBlock]->Pred.intersect(BlockInfo[Pred->getNumber()]->Exit);
+ unsigned PredBlock = Pred->getNumber();
+ if (BlockInfo[PredBlock]->ExitSet) {
+ if (BlockInfo[ThisBlock]->ExitSet) {
+ BlockInfo[ThisBlock]->Pred =
+ BlockInfo[ThisBlock]->Pred.intersect(BlockInfo[PredBlock]->Exit);
+ } else {
+ BlockInfo[ThisBlock]->Pred = BlockInfo[PredBlock]->Exit;
+ }
+ ExitSet = true;
+ } else if (PredBlock != ThisBlock)
+ RevisitRequired = true;
}
}
- Status TmpStatus = BlockInfo[ThisBlock]->Pred.merge(BlockInfo[ThisBlock]->Change);
+ Status TmpStatus =
+ BlockInfo[ThisBlock]->Pred.merge(BlockInfo[ThisBlock]->Change);
if (BlockInfo[ThisBlock]->Exit != TmpStatus) {
BlockInfo[ThisBlock]->Exit = TmpStatus;
// Add the successors to the work list so we can propagate the changed exit
@@ -354,6 +388,9 @@ void SIModeRegister::processBlockPhase2(MachineBasicBlock &MBB,
Phase2List.push(&B);
}
}
+ BlockInfo[ThisBlock]->ExitSet = ExitSet;
+ if (RevisitRequired)
+ Phase2List.push(&MBB);
}
// In Phase 3 we revisit each block and if it has an insertion point defined we
@@ -361,10 +398,10 @@ void SIModeRegister::processBlockPhase2(MachineBasicBlock &MBB,
// not we insert an appropriate setreg instruction to modify the Mode register.
void SIModeRegister::processBlockPhase3(MachineBasicBlock &MBB,
const SIInstrInfo *TII) {
-// BlockData *BI = BlockInfo[MBB.getNumber()];
unsigned ThisBlock = MBB.getNumber();
if (!BlockInfo[ThisBlock]->Pred.isCompatible(BlockInfo[ThisBlock]->Require)) {
- Status Delta = BlockInfo[ThisBlock]->Pred.delta(BlockInfo[ThisBlock]->Require);
+ Status Delta =
+ BlockInfo[ThisBlock]->Pred.delta(BlockInfo[ThisBlock]->Require);
if (BlockInfo[ThisBlock]->FirstInsertionPoint)
insertSetreg(MBB, BlockInfo[ThisBlock]->FirstInsertionPoint, TII, Delta);
else
@@ -401,5 +438,5 @@ bool SIModeRegister::runOnMachineFunction(MachineFunction &MF) {
BlockInfo.clear();
- return NumSetregInserted > 0;
+ return Changed;
}
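The Phase 2 change above propagates mode-register state only from predecessors whose exit status is already known. The sketch below is a simplified, self-contained model of that idea: Status::intersect keeps only the bits known in both inputs with matching values, predecessors without a known exit are skipped, and skipping a non-self predecessor marks the block for a revisit. It omits the work-list plumbing and the sole-self-predecessor default case handled in the real pass; the names are illustrative.

#include <cstdio>
#include <vector>

struct Status {
  unsigned Mask = 0; // Bits whose value is known.
  unsigned Mode = 0; // The known values.
  Status intersect(const Status &S) const {
    // Keep bits known on both sides whose values agree.
    unsigned NewMask = Mask & S.Mask & ~(Mode ^ S.Mode);
    return {NewMask, Mode & NewMask};
  }
};

struct Block { Status Exit; bool ExitSet = false; };

// Returns false if an unknown predecessor forces a later revisit of ThisBlock.
static bool computePred(const std::vector<int> &Preds, int ThisBlock,
                        std::vector<Block> &Info, Status &PredStatus) {
  bool Seeded = false, Revisit = false;
  for (int P : Preds) {
    if (!Info[P].ExitSet) {
      if (P != ThisBlock)   // A self-loop never supplies new information.
        Revisit = true;
      continue;
    }
    PredStatus = Seeded ? PredStatus.intersect(Info[P].Exit) : Info[P].Exit;
    Seeded = true;
  }
  return !Revisit;
}

int main() {
  std::vector<Block> Info(2);
  Info[0] = {{0xF, 0x3}, true};
  Status Pred;
  bool Done = computePred({0, 1}, /*ThisBlock=*/1, Info, Pred);
  std::printf("mask=%x mode=%x done=%d\n", Pred.Mask, Pred.Mode, int(Done));
}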
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
index 34199d3e425c4..8af00fcf62a82 100644
--- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
@@ -7,15 +7,8 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// This pass removes redundant S_OR_B64 instructions enabling lanes in
-/// the exec. If two SI_END_CF (lowered as S_OR_B64) come together without any
-/// vector instructions between them we can only keep outer SI_END_CF, given
-/// that CFG is structured and exec bits of the outer end statement are always
-/// not less than exec bit of the inner one.
-///
-/// This needs to be done before the RA to eliminate saved exec bits registers
-/// but after register coalescer to have no vector registers copies in between
-/// of different end cf statements.
+/// This pass performs exec mask handling peephole optimizations which need
+/// to be done before register allocation to reduce register pressure.
///
//===----------------------------------------------------------------------===//
@@ -40,14 +33,6 @@ private:
MachineRegisterInfo *MRI;
public:
- MachineBasicBlock::iterator skipIgnoreExecInsts(
- MachineBasicBlock::iterator I, MachineBasicBlock::iterator E) const;
-
- MachineBasicBlock::iterator skipIgnoreExecInstsTrivialSucc(
- MachineBasicBlock *&MBB,
- MachineBasicBlock::iterator It) const;
-
-public:
static char ID;
SIOptimizeExecMaskingPreRA() : MachineFunctionPass(ID) {
@@ -83,93 +68,15 @@ FunctionPass *llvm::createSIOptimizeExecMaskingPreRAPass() {
return new SIOptimizeExecMaskingPreRA();
}
-static bool isEndCF(const MachineInstr &MI, const SIRegisterInfo *TRI,
- const GCNSubtarget &ST) {
- if (ST.isWave32()) {
- return MI.getOpcode() == AMDGPU::S_OR_B32 &&
- MI.modifiesRegister(AMDGPU::EXEC_LO, TRI);
- }
-
- return MI.getOpcode() == AMDGPU::S_OR_B64 &&
- MI.modifiesRegister(AMDGPU::EXEC, TRI);
-}
-
static bool isFullExecCopy(const MachineInstr& MI, const GCNSubtarget& ST) {
unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- if (MI.isCopy() && MI.getOperand(1).getReg() == Exec) {
- assert(MI.isFullCopy());
+ if (MI.isFullCopy() && MI.getOperand(1).getReg() == Exec)
return true;
- }
return false;
}
-static unsigned getOrNonExecReg(const MachineInstr &MI,
- const SIInstrInfo &TII,
- const GCNSubtarget& ST) {
- unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- auto Op = TII.getNamedOperand(MI, AMDGPU::OpName::src1);
- if (Op->isReg() && Op->getReg() != Exec)
- return Op->getReg();
- Op = TII.getNamedOperand(MI, AMDGPU::OpName::src0);
- if (Op->isReg() && Op->getReg() != Exec)
- return Op->getReg();
- return AMDGPU::NoRegister;
-}
-
-static MachineInstr* getOrExecSource(const MachineInstr &MI,
- const SIInstrInfo &TII,
- const MachineRegisterInfo &MRI,
- const GCNSubtarget& ST) {
- auto SavedExec = getOrNonExecReg(MI, TII, ST);
- if (SavedExec == AMDGPU::NoRegister)
- return nullptr;
- auto SaveExecInst = MRI.getUniqueVRegDef(SavedExec);
- if (!SaveExecInst || !isFullExecCopy(*SaveExecInst, ST))
- return nullptr;
- return SaveExecInst;
-}
-
-/// Skip over instructions that don't care about the exec mask.
-MachineBasicBlock::iterator SIOptimizeExecMaskingPreRA::skipIgnoreExecInsts(
- MachineBasicBlock::iterator I, MachineBasicBlock::iterator E) const {
- for ( ; I != E; ++I) {
- if (TII->mayReadEXEC(*MRI, *I))
- break;
- }
-
- return I;
-}
-
-// Skip to the next instruction, ignoring debug instructions, and trivial block
-// boundaries (blocks that have one (typically fallthrough) successor, and the
-// successor has one predecessor.
-MachineBasicBlock::iterator
-SIOptimizeExecMaskingPreRA::skipIgnoreExecInstsTrivialSucc(
- MachineBasicBlock *&MBB,
- MachineBasicBlock::iterator It) const {
-
- do {
- It = skipIgnoreExecInsts(It, MBB->end());
- if (It != MBB->end() || MBB->succ_size() != 1)
- break;
-
- // If there is one trivial successor, advance to the next block.
- MachineBasicBlock *Succ = *MBB->succ_begin();
-
- // TODO: Is this really necessary?
- if (!MBB->isLayoutSuccessor(Succ))
- break;
-
- It = Succ->begin();
- MBB = Succ;
- } while (true);
-
- return It;
-}
-
-
// Optimize sequence
// %sel = V_CNDMASK_B32_e64 0, 1, %cc
// %cmp = V_CMP_NE_U32 1, %1
@@ -261,6 +168,11 @@ static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB,
And->getOperand(0).getReg())
.addReg(ExecReg)
.addReg(CCReg, getUndefRegState(CC->isUndef()), CC->getSubReg());
+ MachineOperand &AndSCC = And->getOperand(3);
+ assert(AndSCC.getReg() == AMDGPU::SCC);
+ MachineOperand &Andn2SCC = Andn2->getOperand(3);
+ assert(Andn2SCC.getReg() == AMDGPU::SCC);
+ Andn2SCC.setIsDead(AndSCC.isDead());
And->eraseFromParent();
LIS->InsertMachineInstrInMaps(*Andn2);
@@ -379,57 +291,30 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
continue;
}
- // Try to collapse adjacent endifs.
- auto E = MBB.end();
- auto Lead = skipDebugInstructionsForward(MBB.begin(), E);
- if (MBB.succ_size() != 1 || Lead == E || !isEndCF(*Lead, TRI, ST))
- continue;
-
- MachineBasicBlock *TmpMBB = &MBB;
- auto NextLead = skipIgnoreExecInstsTrivialSucc(TmpMBB, std::next(Lead));
- if (NextLead == TmpMBB->end() || !isEndCF(*NextLead, TRI, ST) ||
- !getOrExecSource(*NextLead, *TII, MRI, ST))
- continue;
-
- LLVM_DEBUG(dbgs() << "Redundant EXEC = S_OR_B64 found: " << *Lead << '\n');
-
- auto SaveExec = getOrExecSource(*Lead, *TII, MRI, ST);
- unsigned SaveExecReg = getOrNonExecReg(*Lead, *TII, ST);
- for (auto &Op : Lead->operands()) {
- if (Op.isReg())
- RecalcRegs.insert(Op.getReg());
- }
-
- LIS->RemoveMachineInstrFromMaps(*Lead);
- Lead->eraseFromParent();
- if (SaveExecReg) {
- LIS->removeInterval(SaveExecReg);
- LIS->createAndComputeVirtRegInterval(SaveExecReg);
- }
-
- Changed = true;
-
- // If the only use of saved exec in the removed instruction is S_AND_B64
- // fold the copy now.
- if (!SaveExec || !SaveExec->isFullCopy())
- continue;
+ // If the only user of a copy of exec is a logical operation, fold the
+ // copy now to prevent forming a saveexec. I.e.:
+ //
+ // %0:sreg_64 = COPY $exec
+ // %1:sreg_64 = S_AND_B64 %0:sreg_64, %2:sreg_64
+ // =>
+ // %1 = S_AND_B64 $exec, %2:sreg_64
+ unsigned ScanThreshold = 10;
+ for (auto I = MBB.rbegin(), E = MBB.rend(); I != E
+ && ScanThreshold--; ++I) {
+ if (!isFullExecCopy(*I, ST))
+ continue;
- Register SavedExec = SaveExec->getOperand(0).getReg();
- bool SafeToReplace = true;
- for (auto& U : MRI.use_nodbg_instructions(SavedExec)) {
- if (U.getParent() != SaveExec->getParent()) {
- SafeToReplace = false;
- break;
+ Register SavedExec = I->getOperand(0).getReg();
+ if (SavedExec.isVirtual() && MRI.hasOneNonDBGUse(SavedExec) &&
+ MRI.use_instr_nodbg_begin(SavedExec)->getParent() == I->getParent()) {
+ LLVM_DEBUG(dbgs() << "Redundant EXEC COPY: " << *I << '\n');
+ LIS->RemoveMachineInstrFromMaps(*I);
+ I->eraseFromParent();
+ MRI.replaceRegWith(SavedExec, Exec);
+ LIS->removeInterval(SavedExec);
+ Changed = true;
}
-
- LLVM_DEBUG(dbgs() << "Redundant EXEC COPY: " << *SaveExec << '\n');
- }
-
- if (SafeToReplace) {
- LIS->RemoveMachineInstrFromMaps(*SaveExec);
- SaveExec->eraseFromParent();
- MRI.replaceRegWith(SavedExec, Exec);
- LIS->removeInterval(SavedExec);
+ break;
}
}
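To make the rewritten tail of SIOptimizeExecMaskingPreRA::runOnMachineFunction easier to follow, here is a rough stand-alone model of the fold: walk at most ScanThreshold instructions backwards from the block end, and when a full copy of EXEC whose result has a single use in the same block is found, rewrite that use to read EXEC directly and delete the copy. Instr, foldExecCopy and the register encoding are invented for the illustration; the real pass also updates LiveIntervals.

#include <cstdio>
#include <iterator>
#include <list>
#include <vector>

constexpr unsigned EXEC = 0;

struct Instr {
  bool IsFullExecCopy = false;
  unsigned Def = 0;              // Virtual register defined, 0 if none.
  std::vector<unsigned> Uses;    // Virtual registers read.
};

static bool foldExecCopy(std::list<Instr> &Block) {
  unsigned ScanThreshold = 10;
  for (auto I = Block.rbegin(), E = Block.rend(); I != E && ScanThreshold--;
       ++I) {
    if (!I->IsFullExecCopy)
      continue;
    unsigned SavedExec = I->Def;
    Instr *OnlyUser = nullptr;
    unsigned NumUses = 0;
    for (Instr &MI : Block)       // Require exactly one use in this block.
      for (unsigned U : MI.Uses)
        if (U == SavedExec) {
          OnlyUser = &MI;
          ++NumUses;
        }
    if (NumUses == 1) {
      for (unsigned &U : OnlyUser->Uses)
        if (U == SavedExec)
          U = EXEC;               // The user now reads EXEC directly.
      Block.erase(std::next(I).base()); // Remove the redundant copy.
      return true;
    }
    break;                        // Stop at the first full exec copy.
  }
  return false;
}

int main() {
  std::list<Instr> Block = {{true, 5, {}}, {false, 6, {5, 7}}};
  std::printf("folded=%d\n", int(foldExecCopy(Block)));
}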
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 05c81feb23ecd..9a1855c3458be 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -244,11 +244,6 @@ static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) {
return OS;
}
-static raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) {
- Operand.print(OS);
- return OS;
-}
-
LLVM_DUMP_METHOD
void SDWASrcOperand::print(raw_ostream& OS) const {
OS << "SDWA src: " << *getTargetOperand()
@@ -850,6 +845,13 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
return std::unique_ptr<SDWAOperand>(nullptr);
}
+#if !defined(NDEBUG)
+static raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) {
+ Operand.print(OS);
+ return OS;
+}
+#endif
+
void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
for (MachineInstr &MI : MBB) {
if (auto Operand = matchSDWAOperand(MI)) {
@@ -920,18 +922,24 @@ void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
if (I->modifiesRegister(AMDGPU::VCC, TRI))
return;
}
+
// Make the two new e32 instruction variants.
// Replace MI with V_{SUB|ADD}_I32_e32
- auto NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc));
- NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst));
- NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));
- NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1));
+ BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc))
+ .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
+ .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
+ .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
+ .setMIFlags(MI.getFlags());
+
MI.eraseFromParent();
+
// Replace MISucc with V_{SUBB|ADDC}_U32_e32
- auto NewInst = BuildMI(MBB, MISucc, MISucc.getDebugLoc(), TII->get(SuccOpc));
- NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::vdst));
- NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src0));
- NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src1));
+ BuildMI(MBB, MISucc, MISucc.getDebugLoc(), TII->get(SuccOpc))
+ .add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::vdst))
+ .add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src0))
+ .add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src1))
+ .setMIFlags(MISucc.getFlags());
+
MISucc.eraseFromParent();
}
@@ -1008,7 +1016,8 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
// Create SDWA version of instruction MI and initialize its operands
MachineInstrBuilder SDWAInst =
- BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc);
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc)
+ .setMIFlags(MI.getFlags());
// Copy dst, if it is present in original then should also be present in SDWA
MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
diff --git a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
new file mode 100644
index 0000000000000..4c72fa2359750
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
@@ -0,0 +1,139 @@
+//===-- SIPostRABundler.cpp -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass creates bundles of memory instructions to protect adjacent loads
+/// and stores from being rescheduled apart from each other post-RA.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIDefines.h"
+#include "SIInstrInfo.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-post-ra-bundler"
+
+namespace {
+
+class SIPostRABundler : public MachineFunctionPass {
+public:
+ static char ID;
+
+public:
+ SIPostRABundler() : MachineFunctionPass(ID) {
+ initializeSIPostRABundlerPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override {
+ return "SI post-RA bundler";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+private:
+ const SIRegisterInfo *TRI;
+
+ SmallSet<Register, 16> Defs;
+
+ bool isDependentLoad(const MachineInstr &MI) const;
+
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS(SIPostRABundler, DEBUG_TYPE, "SI post-RA bundler", false, false)
+
+char SIPostRABundler::ID = 0;
+
+char &llvm::SIPostRABundlerID = SIPostRABundler::ID;
+
+FunctionPass *llvm::createSIPostRABundlerPass() {
+ return new SIPostRABundler();
+}
+
+bool SIPostRABundler::isDependentLoad(const MachineInstr &MI) const {
+ if (!MI.mayLoad())
+ return false;
+
+ for (const MachineOperand &Op : MI.explicit_operands()) {
+ if (!Op.isReg())
+ continue;
+ Register Reg = Op.getReg();
+ for (Register Def : Defs)
+ if (TRI->regsOverlap(Reg, Def))
+ return true;
+ }
+
+ return false;
+}
+
+bool SIPostRABundler::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
+ bool Changed = false;
+ const uint64_t MemFlags = SIInstrFlags::MTBUF | SIInstrFlags::MUBUF |
+ SIInstrFlags::SMRD | SIInstrFlags::DS |
+ SIInstrFlags::FLAT | SIInstrFlags::MIMG;
+
+ for (MachineBasicBlock &MBB : MF) {
+ MachineBasicBlock::instr_iterator Next;
+ MachineBasicBlock::instr_iterator B = MBB.instr_begin();
+ MachineBasicBlock::instr_iterator E = MBB.instr_end();
+ for (auto I = B; I != E; I = Next) {
+ Next = std::next(I);
+
+ const uint64_t IMemFlags = I->getDesc().TSFlags & MemFlags;
+
+ if (IMemFlags == 0 || I->isBundled() || !I->mayLoadOrStore() ||
+ B->mayLoad() != I->mayLoad() || B->mayStore() != I->mayStore() ||
+ ((B->getDesc().TSFlags & MemFlags) != IMemFlags) ||
+ isDependentLoad(*I)) {
+
+ if (B != I) {
+ if (std::next(B) != I) {
+ finalizeBundle(MBB, B, I);
+ Changed = true;
+ }
+ Next = I;
+ }
+
+ B = Next;
+ Defs.clear();
+ continue;
+ }
+
+ if (I->getNumExplicitDefs() == 0)
+ continue;
+
+ Defs.insert(I->defs().begin()->getReg());
+ }
+
+ if (B != E && std::next(B) != E) {
+ finalizeBundle(MBB, B, E);
+ Changed = true;
+ }
+
+ Defs.clear();
+ }
+
+ return Changed;
+}
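The new SIPostRABundler pass groups adjacent memory instructions; the sketch below models its grouping rule with plain data: consecutive instructions stay in one candidate bundle while they are all loads or all stores of the same memory class and no load reads a register defined earlier in the bundle, and only runs of two or more members are finalized. MemInstr, findBundles and the register numbers are illustrative stand-ins, and the sketch ignores the non-memory instructions the real pass skips over.

#include <cstdio>
#include <set>
#include <utility>
#include <vector>

struct MemInstr {
  bool IsLoad;
  unsigned Class;                 // e.g. MUBUF vs. FLAT vs. DS.
  unsigned Def;                   // Register defined (0 if none).
  std::vector<unsigned> Uses;     // Registers read.
};

static std::vector<std::pair<size_t, size_t>>
findBundles(const std::vector<MemInstr> &Block) {
  std::vector<std::pair<size_t, size_t>> Bundles; // [begin, end) ranges.
  std::set<unsigned> Defs;
  size_t B = 0;
  for (size_t I = 0; I <= Block.size(); ++I) {
    bool Break = I == Block.size();
    if (!Break) {
      const MemInstr &MI = Block[I];
      bool Dependent = false;     // A load reading an earlier def ends the run.
      for (unsigned U : MI.Uses)
        Dependent |= MI.IsLoad && Defs.count(U);
      Break = MI.IsLoad != Block[B].IsLoad || MI.Class != Block[B].Class ||
              Dependent;
    }
    if (Break) {
      if (I - B >= 2)
        Bundles.push_back({B, I}); // Worth bundling: two or more members.
      B = I;
      Defs.clear();
      if (I < Block.size() && Block[I].Def)
        Defs.insert(Block[I].Def);
      continue;
    }
    if (Block[I].Def)
      Defs.insert(Block[I].Def);
  }
  return Bundles;
}

int main() {
  std::vector<MemInstr> Block = {{true, 0, 1, {}}, {true, 0, 2, {}},
                                 {true, 0, 3, {1}}}; // Third load depends on reg 1.
  for (auto [Begin, End] : findBundles(Block))
    std::printf("bundle [%zu, %zu)\n", Begin, End);
}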
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
new file mode 100644
index 0000000000000..f31c722db1b26
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -0,0 +1,326 @@
+//===-- SIPreEmitPeephole.cpp ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass performs peephole optimizations just before code emission.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-pre-emit-peephole"
+
+namespace {
+
+class SIPreEmitPeephole : public MachineFunctionPass {
+private:
+ const SIInstrInfo *TII = nullptr;
+ const SIRegisterInfo *TRI = nullptr;
+
+ bool optimizeVccBranch(MachineInstr &MI) const;
+ bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;
+
+public:
+ static char ID;
+
+ SIPreEmitPeephole() : MachineFunctionPass(ID) {
+ initializeSIPreEmitPeepholePass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS(SIPreEmitPeephole, DEBUG_TYPE,
+ "SI peephole optimizations", false, false)
+
+char SIPreEmitPeephole::ID = 0;
+
+char &llvm::SIPreEmitPeepholeID = SIPreEmitPeephole::ID;
+
+bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
+ // Match:
+ // sreg = -1 or 0
+ // vcc = S_AND_B64 exec, sreg or S_ANDN2_B64 exec, sreg
+ // S_CBRANCH_VCC[N]Z
+ // =>
+ // S_CBRANCH_EXEC[N]Z
+ // We end up with this pattern sometimes after basic block placement.
+ // It happens when a block that assigns -1 or 0 to a saved mask is combined
+ // with another block that consumes that saved mask and then branches.
+ bool Changed = false;
+ MachineBasicBlock &MBB = *MI.getParent();
+ const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
+ const bool IsWave32 = ST.isWave32();
+ const unsigned CondReg = TRI->getVCC();
+ const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
+ const unsigned AndN2 = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
+
+ MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
+ E = MBB.rend();
+ bool ReadsCond = false;
+ unsigned Threshold = 5;
+ for (++A; A != E; ++A) {
+ if (!--Threshold)
+ return false;
+ if (A->modifiesRegister(ExecReg, TRI))
+ return false;
+ if (A->modifiesRegister(CondReg, TRI)) {
+ if (!A->definesRegister(CondReg, TRI) ||
+ (A->getOpcode() != And && A->getOpcode() != AndN2))
+ return false;
+ break;
+ }
+ ReadsCond |= A->readsRegister(CondReg, TRI);
+ }
+ if (A == E)
+ return false;
+
+ MachineOperand &Op1 = A->getOperand(1);
+ MachineOperand &Op2 = A->getOperand(2);
+ if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
+ TII->commuteInstruction(*A);
+ Changed = true;
+ }
+ if (Op1.getReg() != ExecReg)
+ return Changed;
+ if (Op2.isImm() && !(Op2.getImm() == -1 || Op2.getImm() == 0))
+ return Changed;
+
+ int64_t MaskValue = 0;
+ Register SReg;
+ if (Op2.isReg()) {
+ SReg = Op2.getReg();
+ auto M = std::next(A);
+ bool ReadsSreg = false;
+ for (; M != E; ++M) {
+ if (M->definesRegister(SReg, TRI))
+ break;
+ if (M->modifiesRegister(SReg, TRI))
+ return Changed;
+ ReadsSreg |= M->readsRegister(SReg, TRI);
+ }
+ if (M == E || !M->isMoveImmediate() || !M->getOperand(1).isImm() ||
+ (M->getOperand(1).getImm() != -1 && M->getOperand(1).getImm() != 0))
+ return Changed;
+ MaskValue = M->getOperand(1).getImm();
+ // First, if sreg is only used in the AND instruction, fold the immediate
+ // into the AND.
+ if (!ReadsSreg && Op2.isKill()) {
+ A->getOperand(2).ChangeToImmediate(MaskValue);
+ M->eraseFromParent();
+ }
+ } else if (Op2.isImm()) {
+ MaskValue = Op2.getImm();
+ } else {
+ llvm_unreachable("Op2 must be register or immediate");
+ }
+
+ // Invert mask for s_andn2
+ assert(MaskValue == 0 || MaskValue == -1);
+ if (A->getOpcode() == AndN2)
+ MaskValue = ~MaskValue;
+
+ if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC) &&
+ MI.killsRegister(CondReg, TRI))
+ A->eraseFromParent();
+
+ bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
+ if (SReg == ExecReg) {
+ // EXEC is updated directly
+ if (IsVCCZ) {
+ MI.eraseFromParent();
+ return true;
+ }
+ MI.setDesc(TII->get(AMDGPU::S_BRANCH));
+ } else if (IsVCCZ && MaskValue == 0) {
+ // Will always branch
+ // Remove all successors shadowed by the new unconditional branch
+ MachineBasicBlock *Parent = MI.getParent();
+ SmallVector<MachineInstr *, 4> ToRemove;
+ bool Found = false;
+ for (MachineInstr &Term : Parent->terminators()) {
+ if (Found) {
+ if (Term.isBranch())
+ ToRemove.push_back(&Term);
+ } else {
+ Found = Term.isIdenticalTo(MI);
+ }
+ }
+ assert(Found && "conditional branch is not terminator");
+ for (auto BranchMI : ToRemove) {
+ MachineOperand &Dst = BranchMI->getOperand(0);
+ assert(Dst.isMBB() && "destination is not basic block");
+ Parent->removeSuccessor(Dst.getMBB());
+ BranchMI->eraseFromParent();
+ }
+
+ if (MachineBasicBlock *Succ = Parent->getFallThrough()) {
+ Parent->removeSuccessor(Succ);
+ }
+
+ // Rewrite to unconditional branch
+ MI.setDesc(TII->get(AMDGPU::S_BRANCH));
+ } else if (!IsVCCZ && MaskValue == 0) {
+ // Will never branch
+ MachineOperand &Dst = MI.getOperand(0);
+ assert(Dst.isMBB() && "destination is not basic block");
+ MI.getParent()->removeSuccessor(Dst.getMBB());
+ MI.eraseFromParent();
+ return true;
+ } else if (MaskValue == -1) {
+ // Depends only on EXEC
+ MI.setDesc(
+ TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
+ }
+
+ MI.RemoveOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI));
+ MI.addImplicitDefUseOperands(*MBB.getParent());
+
+ return true;
+}
+
+bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
+ MachineInstr &MI) const {
+ MachineBasicBlock &MBB = *MI.getParent();
+ const MachineFunction &MF = *MBB.getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ Register IdxReg = Idx->isReg() ? Idx->getReg() : Register();
+ SmallVector<MachineInstr *, 4> ToRemove;
+ bool IdxOn = true;
+
+ if (!MI.isIdenticalTo(First))
+ return false;
+
+ // Scan the instructions between the two identical S_SET_GPR_IDX_ON for
+ // anything that blocks removing the second one.
+ for (MachineBasicBlock::iterator I = std::next(First.getIterator()),
+ E = MI.getIterator(); I != E; ++I) {
+ switch (I->getOpcode()) {
+ case AMDGPU::S_SET_GPR_IDX_MODE:
+ return false;
+ case AMDGPU::S_SET_GPR_IDX_OFF:
+ IdxOn = false;
+ ToRemove.push_back(&*I);
+ break;
+ default:
+ if (I->modifiesRegister(AMDGPU::M0, TRI))
+ return false;
+ if (IdxReg && I->modifiesRegister(IdxReg, TRI))
+ return false;
+ if (llvm::any_of(I->operands(),
+ [&MRI, this](const MachineOperand &MO) {
+ return MO.isReg() &&
+ TRI->isVectorRegister(MRI, MO.getReg());
+ })) {
+ // The only exception allowed here is another indirect vector move
+ // with the same mode.
+ if (!IdxOn ||
+ !((I->getOpcode() == AMDGPU::V_MOV_B32_e32 &&
+ I->hasRegisterImplicitUseOperand(AMDGPU::M0)) ||
+ I->getOpcode() == AMDGPU::V_MOV_B32_indirect))
+ return false;
+ }
+ }
+ }
+
+ MI.eraseFromParent();
+ for (MachineInstr *RI : ToRemove)
+ RI->eraseFromParent();
+ return true;
+}
+
+bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ TII = ST.getInstrInfo();
+ TRI = &TII->getRegisterInfo();
+ MachineBasicBlock *EmptyMBBAtEnd = nullptr;
+ bool Changed = false;
+
+ for (MachineBasicBlock &MBB : MF) {
+ MachineBasicBlock::iterator MBBE = MBB.getFirstTerminator();
+ if (MBBE != MBB.end()) {
+ MachineInstr &MI = *MBBE;
+ switch (MI.getOpcode()) {
+ case AMDGPU::S_CBRANCH_VCCZ:
+ case AMDGPU::S_CBRANCH_VCCNZ:
+ Changed |= optimizeVccBranch(MI);
+ continue;
+ case AMDGPU::SI_RETURN_TO_EPILOG:
+ // FIXME: This is not an optimization and should be
+ // moved somewhere else.
+ assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());
+
+ // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
+ // because external bytecode will be appended at the end.
+ if (&MBB != &MF.back() || &MI != &MBB.back()) {
+ // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty block
+ // at the end and jump there.
+ if (!EmptyMBBAtEnd) {
+ EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
+ MF.insert(MF.end(), EmptyMBBAtEnd);
+ }
+
+ MBB.addSuccessor(EmptyMBBAtEnd);
+ BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
+ .addMBB(EmptyMBBAtEnd);
+ MI.eraseFromParent();
+ MBBE = MBB.getFirstTerminator();
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (!ST.hasVGPRIndexMode())
+ continue;
+
+ MachineInstr *SetGPRMI = nullptr;
+ const unsigned Threshold = 20;
+ unsigned Count = 0;
+ // Scan the block for pairs of S_SET_GPR_IDX_ON instructions to see if the
+ // second one is redundant. The expensive checks are done in optimizeSetGPR(),
+ // and the distance is limited to 20 instructions for compile-time purposes.
+ for (MachineBasicBlock::iterator MBBI = MBB.begin(); MBBI != MBBE; ) {
+ MachineInstr &MI = *MBBI;
+ ++MBBI;
+
+ if (Count == Threshold)
+ SetGPRMI = nullptr;
+ else
+ ++Count;
+
+ if (MI.getOpcode() != AMDGPU::S_SET_GPR_IDX_ON)
+ continue;
+
+ Count = 0;
+ if (!SetGPRMI) {
+ SetGPRMI = &MI;
+ continue;
+ }
+
+ if (optimizeSetGPR(*SetGPRMI, MI))
+ Changed = true;
+ else
+ SetGPRMI = &MI;
+ }
+ }
+
+ return Changed;
+}
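
For readers tracing optimizeVccBranch, here is a hedged sketch of its decision table once the mask feeding VCC has been proven to be 0 or -1 (with the S_ANDN2 case already folded in by inverting the mask). The enum and function names are illustrative only, not LLVM opcodes; the case where the AND source is EXEC itself is handled separately in the pass and is omitted here.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Possible rewrites of the conditional branch; the names are illustrative.
enum class BranchRewrite { DeleteBranch, MakeUnconditional, UseExecZ, UseExecNZ };

// MaskValue is assumed to already be 0 or -1, exactly as the pass establishes
// before it reaches this decision.
BranchRewrite rewriteVccBranch(bool IsVCCZ, int64_t MaskValue) {
  assert(MaskValue == 0 || MaskValue == -1);
  if (MaskValue == 0) // vcc = exec & 0 is always zero
    return IsVCCZ ? BranchRewrite::MakeUnconditional
                  : BranchRewrite::DeleteBranch;
  // MaskValue == -1: vcc = exec & -1 == exec, so test EXEC directly.
  return IsVCCZ ? BranchRewrite::UseExecZ : BranchRewrite::UseExecNZ;
}

int main() {
  // s_and_b64 vcc, exec, 0 followed by s_cbranch_vccz becomes s_branch.
  std::printf("%d\n", rewriteVccBranch(/*IsVCCZ=*/true, /*MaskValue=*/0) ==
                          BranchRewrite::MakeUnconditional); // prints 1
  return 0;
}
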
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index fbadad3c84ad8..5d6009ebf3843 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -26,27 +26,12 @@
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
+#include <vector>
using namespace llvm;
-static bool hasPressureSet(const int *PSets, unsigned PSetID) {
- for (unsigned i = 0; PSets[i] != -1; ++i) {
- if (PSets[i] == (int)PSetID)
- return true;
- }
- return false;
-}
-
-void SIRegisterInfo::classifyPressureSet(unsigned PSetID, unsigned Reg,
- BitVector &PressureSets) const {
- for (MCRegUnitIterator U(Reg, this); U.isValid(); ++U) {
- const int *PSets = getRegUnitPressureSets(*U);
- if (hasPressureSet(PSets, PSetID)) {
- PressureSets.set(PSetID);
- break;
- }
- }
-}
+#define GET_REGINFO_TARGET_DESC
+#include "AMDGPUGenRegisterInfo.inc"
static cl::opt<bool> EnableSpillSGPRToVGPR(
"amdgpu-spill-sgpr-to-vgpr",
@@ -54,90 +39,200 @@ static cl::opt<bool> EnableSpillSGPRToVGPR(
cl::ReallyHidden,
cl::init(true));
-SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) :
- AMDGPURegisterInfo(),
- ST(ST),
- SGPRPressureSets(getNumRegPressureSets()),
- VGPRPressureSets(getNumRegPressureSets()),
- AGPRPressureSets(getNumRegPressureSets()),
- SpillSGPRToVGPR(EnableSpillSGPRToVGPR),
- isWave32(ST.isWave32()) {
- unsigned NumRegPressureSets = getNumRegPressureSets();
-
- SGPRSetID = NumRegPressureSets;
- VGPRSetID = NumRegPressureSets;
- AGPRSetID = NumRegPressureSets;
-
- for (unsigned i = 0; i < NumRegPressureSets; ++i) {
- classifyPressureSet(i, AMDGPU::SGPR0, SGPRPressureSets);
- classifyPressureSet(i, AMDGPU::VGPR0, VGPRPressureSets);
- classifyPressureSet(i, AMDGPU::AGPR0, AGPRPressureSets);
- }
-
- // Determine the number of reg units for each pressure set.
- std::vector<unsigned> PressureSetRegUnits(NumRegPressureSets, 0);
- for (unsigned i = 0, e = getNumRegUnits(); i != e; ++i) {
- const int *PSets = getRegUnitPressureSets(i);
- for (unsigned j = 0; PSets[j] != -1; ++j) {
- ++PressureSetRegUnits[PSets[j]];
+std::array<std::vector<int16_t>, 16> SIRegisterInfo::RegSplitParts;
+
+SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
+ : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour()), ST(ST),
+ SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) {
+
+ assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 &&
+ getSubRegIndexLaneMask(AMDGPU::sub31).getAsInteger() == (3ULL << 62) &&
+ (getSubRegIndexLaneMask(AMDGPU::lo16) |
+ getSubRegIndexLaneMask(AMDGPU::hi16)).getAsInteger() ==
+ getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() &&
+ "getNumCoveredRegs() will not work with generated subreg masks!");
+
+ RegPressureIgnoredUnits.resize(getNumRegUnits());
+ RegPressureIgnoredUnits.set(*MCRegUnitIterator(AMDGPU::M0, this));
+ for (auto Reg : AMDGPU::VGPR_HI16RegClass)
+ RegPressureIgnoredUnits.set(*MCRegUnitIterator(Reg, this));
+
+ // HACK: Until this is fully tablegen'd.
+ static llvm::once_flag InitializeRegSplitPartsFlag;
+
+ static auto InitializeRegSplitPartsOnce = [this]() {
+ for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) {
+ unsigned Size = getSubRegIdxSize(Idx);
+ if (Size & 31)
+ continue;
+ std::vector<int16_t> &Vec = RegSplitParts[Size / 32 - 1];
+ unsigned Pos = getSubRegIdxOffset(Idx);
+ if (Pos % Size)
+ continue;
+ Pos /= Size;
+ if (Vec.empty()) {
+ unsigned MaxNumParts = 1024 / Size; // Maximum register is 1024 bits.
+ Vec.resize(MaxNumParts);
+ }
+ Vec[Pos] = Idx;
}
+ };
+
+
+ llvm::call_once(InitializeRegSplitPartsFlag, InitializeRegSplitPartsOnce);
+}
+
+void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved,
+ MCRegister Reg) const {
+ MCRegAliasIterator R(Reg, this, true);
+
+ for (; R.isValid(); ++R)
+ Reserved.set(*R);
+}
+
+// Forced to be here by one .inc
+const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
+ const MachineFunction *MF) const {
+ CallingConv::ID CC = MF->getFunction().getCallingConv();
+ switch (CC) {
+ case CallingConv::C:
+ case CallingConv::Fast:
+ case CallingConv::Cold:
+ return CSR_AMDGPU_HighRegs_SaveList;
+ default: {
+ // Dummy to not crash RegisterClassInfo.
+ static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
+ return &NoCalleeSavedReg;
}
+ }
+}
- unsigned VGPRMax = 0, SGPRMax = 0, AGPRMax = 0;
- for (unsigned i = 0; i < NumRegPressureSets; ++i) {
- if (isVGPRPressureSet(i) && PressureSetRegUnits[i] > VGPRMax) {
- VGPRSetID = i;
- VGPRMax = PressureSetRegUnits[i];
- continue;
- }
- if (isSGPRPressureSet(i) && PressureSetRegUnits[i] > SGPRMax) {
- SGPRSetID = i;
- SGPRMax = PressureSetRegUnits[i];
- }
- if (isAGPRPressureSet(i) && PressureSetRegUnits[i] > AGPRMax) {
- AGPRSetID = i;
- AGPRMax = PressureSetRegUnits[i];
- continue;
- }
+const MCPhysReg *
+SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const {
+ return nullptr;
+}
+
+const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
+ CallingConv::ID CC) const {
+ switch (CC) {
+ case CallingConv::C:
+ case CallingConv::Fast:
+ case CallingConv::Cold:
+ return CSR_AMDGPU_HighRegs_RegMask;
+ default:
+ return nullptr;
}
+}
- assert(SGPRSetID < NumRegPressureSets &&
- VGPRSetID < NumRegPressureSets &&
- AGPRSetID < NumRegPressureSets);
+Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+ const SIFrameLowering *TFI =
+ MF.getSubtarget<GCNSubtarget>().getFrameLowering();
+ const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+ // During ISel lowering we always reserve the stack pointer in entry
+ // functions, but never actually want to reference it when accessing our own
+ // frame. If we need a frame pointer we use it, but otherwise we can just use
+ // an immediate "0" which we represent by returning NoRegister.
+ if (FuncInfo->isEntryFunction()) {
+ return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register();
+ }
+ return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg()
+ : FuncInfo->getStackPtrOffsetReg();
}
-unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
- const MachineFunction &MF) const {
- unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4;
- unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
- return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SGPR_128RegClass);
+bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
+ // When we need stack realignment, we can't reference off of the
+ // stack pointer, so we reserve a base pointer.
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ return MFI.getNumFixedObjects() && needsStackRealignment(MF);
}
-static unsigned findPrivateSegmentWaveByteOffsetRegIndex(unsigned RegCount) {
- unsigned Reg;
+Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; }
- // Try to place it in a hole after PrivateSegmentBufferReg.
- if (RegCount & 3) {
- // We cannot put the segment buffer in (Idx - 4) ... (Idx - 1) due to
- // alignment constraints, so we have a hole where can put the wave offset.
- Reg = RegCount - 1;
- } else {
- // We can put the segment buffer in (Idx - 4) ... (Idx - 1) and put the
- // wave offset before it.
- Reg = RegCount - 5;
- }
+const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const {
+ return CSR_AMDGPU_AllVGPRs_RegMask;
+}
- return Reg;
+const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const {
+ return CSR_AMDGPU_AllAllocatableSRegs_RegMask;
}
-unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
+// FIXME: TableGen should generate something to make this manageable for all
+// register classes. At a minimum we could use the opposite of
+// composeSubRegIndices and go up from the base 32-bit subreg.
+unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel,
+ unsigned NumRegs) {
+ // Table of NumRegs sized pieces at every 32-bit offset.
+ static const uint16_t SubRegFromChannelTable[][32] = {
+ {AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
+ AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
+ AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
+ AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
+ AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19,
+ AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23,
+ AMDGPU::sub24, AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27,
+ AMDGPU::sub28, AMDGPU::sub29, AMDGPU::sub30, AMDGPU::sub31},
+ {AMDGPU::sub0_sub1, AMDGPU::sub1_sub2, AMDGPU::sub2_sub3,
+ AMDGPU::sub3_sub4, AMDGPU::sub4_sub5, AMDGPU::sub5_sub6,
+ AMDGPU::sub6_sub7, AMDGPU::sub7_sub8, AMDGPU::sub8_sub9,
+ AMDGPU::sub9_sub10, AMDGPU::sub10_sub11, AMDGPU::sub11_sub12,
+ AMDGPU::sub12_sub13, AMDGPU::sub13_sub14, AMDGPU::sub14_sub15,
+ AMDGPU::sub15_sub16, AMDGPU::sub16_sub17, AMDGPU::sub17_sub18,
+ AMDGPU::sub18_sub19, AMDGPU::sub19_sub20, AMDGPU::sub20_sub21,
+ AMDGPU::sub21_sub22, AMDGPU::sub22_sub23, AMDGPU::sub23_sub24,
+ AMDGPU::sub24_sub25, AMDGPU::sub25_sub26, AMDGPU::sub26_sub27,
+ AMDGPU::sub27_sub28, AMDGPU::sub28_sub29, AMDGPU::sub29_sub30,
+ AMDGPU::sub30_sub31, AMDGPU::NoSubRegister},
+ {AMDGPU::sub0_sub1_sub2, AMDGPU::sub1_sub2_sub3,
+ AMDGPU::sub2_sub3_sub4, AMDGPU::sub3_sub4_sub5,
+ AMDGPU::sub4_sub5_sub6, AMDGPU::sub5_sub6_sub7,
+ AMDGPU::sub6_sub7_sub8, AMDGPU::sub7_sub8_sub9,
+ AMDGPU::sub8_sub9_sub10, AMDGPU::sub9_sub10_sub11,
+ AMDGPU::sub10_sub11_sub12, AMDGPU::sub11_sub12_sub13,
+ AMDGPU::sub12_sub13_sub14, AMDGPU::sub13_sub14_sub15,
+ AMDGPU::sub14_sub15_sub16, AMDGPU::sub15_sub16_sub17,
+ AMDGPU::sub16_sub17_sub18, AMDGPU::sub17_sub18_sub19,
+ AMDGPU::sub18_sub19_sub20, AMDGPU::sub19_sub20_sub21,
+ AMDGPU::sub20_sub21_sub22, AMDGPU::sub21_sub22_sub23,
+ AMDGPU::sub22_sub23_sub24, AMDGPU::sub23_sub24_sub25,
+ AMDGPU::sub24_sub25_sub26, AMDGPU::sub25_sub26_sub27,
+ AMDGPU::sub26_sub27_sub28, AMDGPU::sub27_sub28_sub29,
+ AMDGPU::sub28_sub29_sub30, AMDGPU::sub29_sub30_sub31,
+ AMDGPU::NoSubRegister, AMDGPU::NoSubRegister},
+ {AMDGPU::sub0_sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4,
+ AMDGPU::sub2_sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6,
+ AMDGPU::sub4_sub5_sub6_sub7, AMDGPU::sub5_sub6_sub7_sub8,
+ AMDGPU::sub6_sub7_sub8_sub9, AMDGPU::sub7_sub8_sub9_sub10,
+ AMDGPU::sub8_sub9_sub10_sub11, AMDGPU::sub9_sub10_sub11_sub12,
+ AMDGPU::sub10_sub11_sub12_sub13, AMDGPU::sub11_sub12_sub13_sub14,
+ AMDGPU::sub12_sub13_sub14_sub15, AMDGPU::sub13_sub14_sub15_sub16,
+ AMDGPU::sub14_sub15_sub16_sub17, AMDGPU::sub15_sub16_sub17_sub18,
+ AMDGPU::sub16_sub17_sub18_sub19, AMDGPU::sub17_sub18_sub19_sub20,
+ AMDGPU::sub18_sub19_sub20_sub21, AMDGPU::sub19_sub20_sub21_sub22,
+ AMDGPU::sub20_sub21_sub22_sub23, AMDGPU::sub21_sub22_sub23_sub24,
+ AMDGPU::sub22_sub23_sub24_sub25, AMDGPU::sub23_sub24_sub25_sub26,
+ AMDGPU::sub24_sub25_sub26_sub27, AMDGPU::sub25_sub26_sub27_sub28,
+ AMDGPU::sub26_sub27_sub28_sub29, AMDGPU::sub27_sub28_sub29_sub30,
+ AMDGPU::sub28_sub29_sub30_sub31, AMDGPU::NoSubRegister,
+ AMDGPU::NoSubRegister, AMDGPU::NoSubRegister}};
+
+ const unsigned NumRegIndex = NumRegs - 1;
+
+ assert(NumRegIndex < array_lengthof(SubRegFromChannelTable) &&
+ "Not implemented");
+ assert(Channel < array_lengthof(SubRegFromChannelTable[0]));
+ return SubRegFromChannelTable[NumRegIndex][Channel];
+}
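
A small usage sketch for the table above, assuming a hypothetical helper that reproduces the generated sub-register naming: row NumRegs - 1 picks the width in 32-bit channels and the column picks the starting channel, so getSubRegFromChannel(2, 2) corresponds to AMDGPU::sub2_sub3.

#include <cassert>
#include <cstdio>
#include <string>

// Hypothetical helper reproducing the naming pattern of the table rows.
std::string subRegName(unsigned Channel, unsigned NumRegs) {
  assert(NumRegs >= 1 && Channel + NumRegs <= 32 && "1024-bit register max");
  std::string Name = "sub" + std::to_string(Channel);
  for (unsigned I = 1; I < NumRegs; ++I)
    Name += "_sub" + std::to_string(Channel + I);
  return Name;
}

int main() {
  // Matches SubRegFromChannelTable[2 - 1][2] == AMDGPU::sub2_sub3 above.
  std::printf("%s\n", subRegName(2, 2).c_str()); // prints "sub2_sub3"
  return 0;
}
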
+
+MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg(
const MachineFunction &MF) const {
- unsigned Reg = findPrivateSegmentWaveByteOffsetRegIndex(ST.getMaxNumSGPRs(MF));
- return AMDGPU::SGPR_32RegClass.getRegister(Reg);
+ unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4;
+ MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
+ return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SGPR_128RegClass);
}
BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
+ Reserved.set(AMDGPU::MODE);
// EXEC_LO and EXEC_HI could be allocated and used as regular register, but
// this seems likely to result in bugs, so I'm marking them as reserved.
@@ -205,6 +300,18 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
reserveRegisterTuples(Reserved, Reg);
}
+ for (auto Reg : AMDGPU::SReg_32RegClass) {
+ Reserved.set(getSubReg(Reg, AMDGPU::hi16));
+ Register Low = getSubReg(Reg, AMDGPU::lo16);
+ // This is to prevent BB vcc liveness errors.
+ if (!AMDGPU::SGPR_LO16RegClass.contains(Low))
+ Reserved.set(Low);
+ }
+
+ for (auto Reg : AMDGPU::AGPR_32RegClass) {
+ Reserved.set(getSubReg(Reg, AMDGPU::hi16));
+ }
+
// Reserve all the rest AGPRs if there are no instructions to use it.
if (!ST.hasMAIInsts()) {
for (unsigned i = 0; i < MaxNumVGPRs; ++i) {
@@ -215,38 +322,37 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
- if (ScratchWaveOffsetReg != AMDGPU::NoRegister) {
- // Reserve 1 SGPR for scratch wave offset in case we need to spill.
- reserveRegisterTuples(Reserved, ScratchWaveOffsetReg);
- }
-
unsigned ScratchRSrcReg = MFI->getScratchRSrcReg();
if (ScratchRSrcReg != AMDGPU::NoRegister) {
// Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need
// to spill.
// TODO: May need to reserve a VGPR if doing LDS spilling.
reserveRegisterTuples(Reserved, ScratchRSrcReg);
- assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg));
}
// We have to assume the SP is needed in case there are calls in the function,
// which is detected after the function is lowered. If we aren't really going
// to need SP, don't bother reserving it.
- unsigned StackPtrReg = MFI->getStackPtrOffsetReg();
+ MCRegister StackPtrReg = MFI->getStackPtrOffsetReg();
- if (StackPtrReg != AMDGPU::NoRegister) {
+ if (StackPtrReg) {
reserveRegisterTuples(Reserved, StackPtrReg);
assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
}
- unsigned FrameReg = MFI->getFrameOffsetReg();
- if (FrameReg != AMDGPU::NoRegister) {
+ MCRegister FrameReg = MFI->getFrameOffsetReg();
+ if (FrameReg) {
reserveRegisterTuples(Reserved, FrameReg);
assert(!isSubRegister(ScratchRSrcReg, FrameReg));
}
- for (unsigned Reg : MFI->WWMReservedRegs) {
+ if (hasBasePointer(MF)) {
+ MCRegister BasePtrReg = getBaseRegister();
+ reserveRegisterTuples(Reserved, BasePtrReg);
+ assert(!isSubRegister(ScratchRSrcReg, BasePtrReg));
+ }
+
+ for (MCRegister Reg : MFI->WWMReservedRegs) {
reserveRegisterTuples(Reserved, Reg);
}
@@ -257,6 +363,10 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
reserveRegisterTuples(Reserved, Reg);
+ if (MFI->VGPRReservedForSGPRSpill)
+ for (auto SSpill : MFI->getSGPRSpillVGPRs())
+ reserveRegisterTuples(Reserved, SSpill.VGPR);
+
return Reserved;
}
@@ -305,11 +415,6 @@ bool SIRegisterInfo::requiresVirtualBaseRegisters(
return true;
}
-bool SIRegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
- // This helps catch bugs as verifier errors.
- return true;
-}
-
int64_t SIRegisterInfo::getMUBUFInstrOffset(const MachineInstr *MI) const {
assert(SIInstrInfo::isMUBUF(*MI));
@@ -340,7 +445,7 @@ bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
}
void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
- unsigned BaseReg,
+ Register BaseReg,
int FrameIdx,
int64_t Offset) const {
MachineBasicBlock::iterator Ins = MBB->begin();
@@ -374,7 +479,7 @@ void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
.addImm(0); // clamp bit
}
-void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
int64_t Offset) const {
const SIInstrInfo *TII = ST.getInstrInfo();
@@ -411,7 +516,7 @@ void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
}
bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
- unsigned BaseReg,
+ Register BaseReg,
int64_t Offset) const {
if (!SIInstrInfo::isMUBUF(*MI))
return false;
@@ -451,6 +556,11 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
case AMDGPU::SI_SPILL_V256_SAVE:
case AMDGPU::SI_SPILL_V256_RESTORE:
return 8;
+ case AMDGPU::SI_SPILL_S192_SAVE:
+ case AMDGPU::SI_SPILL_S192_RESTORE:
+ case AMDGPU::SI_SPILL_V192_SAVE:
+ case AMDGPU::SI_SPILL_V192_RESTORE:
+ return 6;
case AMDGPU::SI_SPILL_S160_SAVE:
case AMDGPU::SI_SPILL_S160_RESTORE:
case AMDGPU::SI_SPILL_V160_SAVE:
@@ -614,10 +724,10 @@ static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST,
void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
unsigned LoadStoreOp,
int Index,
- unsigned ValueReg,
+ Register ValueReg,
bool IsKill,
- unsigned ScratchRsrcReg,
- unsigned ScratchOffsetReg,
+ MCRegister ScratchRsrcReg,
+ MCRegister ScratchOffsetReg,
int64_t InstOffset,
MachineMemOperand *MMO,
RegScavenger *RS) const {
@@ -625,13 +735,14 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
MachineFunction *MF = MI->getParent()->getParent();
const SIInstrInfo *TII = ST.getInstrInfo();
const MachineFrameInfo &MFI = MF->getFrameInfo();
+ const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
const MCInstrDesc &Desc = TII->get(LoadStoreOp);
const DebugLoc &DL = MI->getDebugLoc();
bool IsStore = Desc.mayStore();
bool Scavenged = false;
- unsigned SOffset = ScratchOffsetReg;
+ MCRegister SOffset = ScratchOffsetReg;
const unsigned EltSize = 4;
const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
@@ -640,7 +751,7 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
int64_t ScratchOffsetRegDelta = 0;
- unsigned Align = MFI.getObjectAlignment(Index);
+ Align Alignment = MFI.getObjectAlign(Index);
const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
Register TmpReg =
@@ -650,7 +761,7 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
assert((Offset % EltSize) == 0 && "unexpected VGPR spill offset");
if (!isUInt<12>(Offset + Size - EltSize)) {
- SOffset = AMDGPU::NoRegister;
+ SOffset = MCRegister();
// We currently only support spilling VGPRs to EltSize boundaries, meaning
// we can simplify the adjustment of Offset here to just scale with
@@ -662,23 +773,33 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
if (RS)
SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0, false);
- if (SOffset == AMDGPU::NoRegister) {
+ if (!SOffset) {
// There are no free SGPRs, and since we are in the process of spilling
// VGPRs too. Since we need a VGPR in order to spill SGPRs (this is true
// on SI/CI and on VI it is true until we implement spilling using scalar
// stores), we have no way to free up an SGPR. Our solution here is to
- // add the offset directly to the ScratchOffset register, and then
- // subtract the offset after the spill to return ScratchOffset to it's
- // original value.
+ // add the offset directly to the ScratchOffset or StackPtrOffset
+ // register, and then subtract the offset after the spill to return the
+ // register to its original value.
+ if (!ScratchOffsetReg)
+ ScratchOffsetReg = FuncInfo->getStackPtrOffsetReg();
SOffset = ScratchOffsetReg;
ScratchOffsetRegDelta = Offset;
} else {
Scavenged = true;
}
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset)
- .addReg(ScratchOffsetReg)
- .addImm(Offset);
+ if (!SOffset)
+ report_fatal_error("could not scavenge SGPR to spill in entry function");
+
+ if (ScratchOffsetReg == AMDGPU::NoRegister) {
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset)
+ .addImm(Offset);
+ } else {
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset)
+ .addReg(ScratchOffsetReg)
+ .addImm(Offset);
+ }
Offset = 0;
}
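
A minimal sketch, under illustrative names and values, of the fallback taken above when the scavenger cannot find a free SGPR for the offset: the offset is temporarily folded into the base register before the access and subtracted back out afterwards.

#include <cstdint>
#include <cstdio>

// Borrow the base register for the duration of the access, then restore it.
void accessWithBorrowedBase(uint32_t &BaseReg, int64_t Offset) {
  BaseReg += Offset; // s_add_u32 base, base, Offset
  std::printf("access at base 0x%x with immediate offset 0\n", BaseReg);
  BaseReg -= Offset; // s_sub_u32 base, base, Offset (restore)
}

int main() {
  uint32_t Base = 0x1000;
  accessWithBorrowedBase(Base, 256);
  std::printf("base restored to 0x%x\n", Base); // prints 0x1000
  return 0;
}
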
@@ -708,21 +829,26 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
}
MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(EltSize * i);
- MachineMemOperand *NewMMO
- = MF->getMachineMemOperand(PInfo, MMO->getFlags(),
- EltSize, MinAlign(Align, EltSize * i));
+ MachineMemOperand *NewMMO =
+ MF->getMachineMemOperand(PInfo, MMO->getFlags(), EltSize,
+ commonAlignment(Alignment, EltSize * i));
MIB = BuildMI(*MBB, MI, DL, Desc)
- .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill))
- .addReg(ScratchRsrcReg)
- .addReg(SOffset, SOffsetRegState)
- .addImm(Offset)
- .addImm(0) // glc
- .addImm(0) // slc
- .addImm(0) // tfe
- .addImm(0) // dlc
- .addImm(0) // swz
- .addMemOperand(NewMMO);
+ .addReg(SubReg,
+ getDefRegState(!IsStore) | getKillRegState(IsKill))
+ .addReg(ScratchRsrcReg);
+ if (SOffset == AMDGPU::NoRegister) {
+ MIB.addImm(0);
+ } else {
+ MIB.addReg(SOffset, SOffsetRegState);
+ }
+ MIB.addImm(Offset)
+ .addImm(0) // glc
+ .addImm(0) // slc
+ .addImm(0) // tfe
+ .addImm(0) // dlc
+ .addImm(0) // swz
+ .addMemOperand(NewMMO);
if (!IsStore && TmpReg != AMDGPU::NoRegister)
MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32),
@@ -736,12 +862,124 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
if (ScratchOffsetRegDelta != 0) {
// Subtract the offset we added to the ScratchOffset register.
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffsetReg)
- .addReg(ScratchOffsetReg)
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), SOffset)
+ .addReg(SOffset)
.addImm(ScratchOffsetRegDelta);
}
}
+// Generate a VMEM access which loads or stores the VGPR containing an SGPR
+// spill such that all the lanes set in VGPRLanes are loaded or stored.
+// This generates exec mask manipulation and will use SGPRs available in MI
+// or lanes of the spill VGPR to save and restore the exec mask.
+void SIRegisterInfo::buildSGPRSpillLoadStore(MachineBasicBlock::iterator MI,
+ int Index, int Offset,
+ unsigned EltSize, Register VGPR,
+ int64_t VGPRLanes,
+ RegScavenger *RS,
+ bool IsLoad) const {
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineFunction *MF = MBB->getParent();
+ SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+
+ Register SuperReg = MI->getOperand(0).getReg();
+ const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
+ ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
+ unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
+ unsigned FirstPart = Offset * 32;
+ unsigned ExecLane = 0;
+
+ bool IsKill = MI->getOperand(0).isKill();
+ const DebugLoc &DL = MI->getDebugLoc();
+
+ // Cannot handle load/store to EXEC
+ assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
+ SuperReg != AMDGPU::EXEC && "exec should never spill");
+
+ // On Wave32 only handle EXEC_LO.
+ // On Wave64 only update EXEC_HI if there is sufficient space for a copy.
+ bool OnlyExecLo = isWave32 || NumSubRegs == 1 || SuperReg == AMDGPU::EXEC_HI;
+
+ unsigned ExecMovOpc = OnlyExecLo ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ Register ExecReg = OnlyExecLo ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ Register SavedExecReg;
+
+ // Backup EXEC
+ if (OnlyExecLo) {
+ SavedExecReg = NumSubRegs == 1
+ ? SuperReg
+ : getSubReg(SuperReg, SplitParts[FirstPart + ExecLane]);
+ } else {
+ // If src/dst is an odd size it is possible subreg0 is not aligned.
+ for (; ExecLane < (NumSubRegs - 1); ++ExecLane) {
+ SavedExecReg = getMatchingSuperReg(
+ getSubReg(SuperReg, SplitParts[FirstPart + ExecLane]), AMDGPU::sub0,
+ &AMDGPU::SReg_64_XEXECRegClass);
+ if (SavedExecReg)
+ break;
+ }
+ }
+ assert(SavedExecReg);
+ BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), SavedExecReg).addReg(ExecReg);
+
+ // Setup EXEC
+ BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), ExecReg).addImm(VGPRLanes);
+
+ // Load/store VGPR
+ MachineFrameInfo &FrameInfo = MF->getFrameInfo();
+ assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill);
+
+ Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
+ ? getBaseRegister()
+ : getFrameRegister(*MF);
+
+ Align Alignment = FrameInfo.getObjectAlign(Index);
+ MachinePointerInfo PtrInfo =
+ MachinePointerInfo::getFixedStack(*MF, Index);
+ MachineMemOperand *MMO = MF->getMachineMemOperand(
+ PtrInfo, IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore,
+ EltSize, Alignment);
+
+ if (IsLoad) {
+ buildSpillLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
+ Index,
+ VGPR, false,
+ MFI->getScratchRSrcReg(), FrameReg,
+ Offset * EltSize, MMO,
+ RS);
+ } else {
+ buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET, Index, VGPR,
+ IsKill, MFI->getScratchRSrcReg(), FrameReg,
+ Offset * EltSize, MMO, RS);
+ // This only ever adds one VGPR spill
+ MFI->addToSpilledVGPRs(1);
+ }
+
+ // Restore EXEC
+ BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), ExecReg)
+ .addReg(SavedExecReg, getKillRegState(IsLoad || IsKill));
+
+ // Restore clobbered SGPRs
+ if (IsLoad) {
+ // Nothing to do; register will be overwritten
+ } else if (!IsKill) {
+ // Restore SGPRs from appropriate VGPR lanes
+ if (!OnlyExecLo) {
+ BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
+ getSubReg(SuperReg, SplitParts[FirstPart + ExecLane + 1]))
+ .addReg(VGPR)
+ .addImm(ExecLane + 1);
+ }
+ BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
+ NumSubRegs == 1
+ ? SavedExecReg
+ : getSubReg(SuperReg, SplitParts[FirstPart + ExecLane]))
+ .addReg(VGPR, RegState::Kill)
+ .addImm(ExecLane);
+ }
+}
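
To make the exec-mask choreography above easier to follow, here is a sketch that lays the emitted sequence out in order. The mnemonics are simplified pseudo-ISA strings, not real MC emission, and the operand names are made up.

#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

// Emits the sequence as strings; each entry is a stand-in for a MachineInstr
// built by buildSGPRSpillLoadStore.
std::vector<std::string> sgprSpillSequence(bool IsLoad, bool IsKill,
                                           int64_t VGPRLanes) {
  std::vector<std::string> Seq;
  Seq.push_back("s_mov saved_exec, exec");                   // back up EXEC
  Seq.push_back("s_mov exec, " + std::to_string(VGPRLanes)); // enable spill lanes
  Seq.push_back(IsLoad ? "buffer_load_dword v_spill, ..."    // buildSpillLoadStore
                       : "buffer_store_dword v_spill, ...");
  Seq.push_back("s_mov exec, saved_exec");                   // restore EXEC
  if (!IsLoad && !IsKill)
    Seq.push_back("v_readlane_b32 ... // re-read SGPRs that held saved_exec");
  return Seq;
}

int main() {
  for (const std::string &S : sgprSpillSequence(false, false, 0x7))
    std::printf("%s\n", S.c_str());
  return 0;
}
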
+
bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
int Index,
RegScavenger *RS,
@@ -749,7 +987,7 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
MachineBasicBlock *MBB = MI->getParent();
MachineFunction *MF = MBB->getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
- DenseSet<unsigned> SGPRSpillVGPRDefinedSet;
+ DenseSet<Register> SGPRSpillVGPRDefinedSet;
ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
= MFI->getSGPRToVGPRSpills(Index);
@@ -763,13 +1001,12 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
bool IsKill = MI->getOperand(0).isKill();
const DebugLoc &DL = MI->getDebugLoc();
- MachineFrameInfo &FrameInfo = MF->getFrameInfo();
-
assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() &&
- SuperReg != MFI->getFrameOffsetReg() &&
- SuperReg != MFI->getScratchWaveOffsetReg()));
+ SuperReg != MFI->getFrameOffsetReg()));
assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
+ assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
+ SuperReg != AMDGPU::EXEC && "exec should never spill");
unsigned EltSize = 4;
const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
@@ -777,17 +1014,10 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
- // Scavenged temporary VGPR to use. It must be scavenged once for any number
- // of spilled subregs.
- Register TmpVGPR;
-
- // SubReg carries the "Kill" flag when SubReg == SuperReg.
- unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
- for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
- Register SubReg =
- NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);
-
- if (SpillToVGPR) {
+ if (SpillToVGPR) {
+ for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
+ Register SubReg =
+ NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);
SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
// During SGPR spilling to VGPR, determine if the VGPR is defined. The
@@ -809,42 +1039,53 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
// FIXME: Since this spills to another register instead of an actual
// frame index, we should delete the frame index when all references to
// it are fixed.
- } else {
- // XXX - Can to VGPR spill fail for some subregisters but not others?
- if (OnlyToVGPR)
- return false;
-
- // Spill SGPR to a frame index.
- if (!TmpVGPR.isValid())
- TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
-
- MachineInstrBuilder Mov
- = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
- .addReg(SubReg, SubKillState);
-
- // There could be undef components of a spilled super register.
- // TODO: Can we detect this and skip the spill?
- if (NumSubRegs > 1) {
- // The last implicit use of the SuperReg carries the "Kill" flag.
- unsigned SuperKillState = 0;
- if (i + 1 == e)
- SuperKillState |= getKillRegState(IsKill);
- Mov.addReg(SuperReg, RegState::Implicit | SuperKillState);
+ }
+ } else {
+ // Scavenged temporary VGPR to use. It must be scavenged once for any number
+ // of spilled subregs.
+ Register TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
+ RS->setRegUsed(TmpVGPR);
+
+ // SubReg carries the "Kill" flag when SubReg == SuperReg.
+ unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
+
+ unsigned PerVGPR = 32;
+ unsigned NumVGPRs = (NumSubRegs + (PerVGPR - 1)) / PerVGPR;
+ int64_t VGPRLanes = (1LL << std::min(PerVGPR, NumSubRegs)) - 1LL;
+
+ for (unsigned Offset = 0; Offset < NumVGPRs; ++Offset) {
+ unsigned TmpVGPRFlags = RegState::Undef;
+
+ // Write sub registers into the VGPR
+ for (unsigned i = Offset * PerVGPR,
+ e = std::min((Offset + 1) * PerVGPR, NumSubRegs);
+ i < e; ++i) {
+ Register SubReg =
+ NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);
+
+ MachineInstrBuilder WriteLane =
+ BuildMI(*MBB, MI, DL,
+ TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
+ TmpVGPR)
+ .addReg(SubReg, SubKillState)
+ .addImm(i % PerVGPR)
+ .addReg(TmpVGPR, TmpVGPRFlags);
+ TmpVGPRFlags = 0;
+
+ // There could be undef components of a spilled super register.
+ // TODO: Can we detect this and skip the spill?
+ if (NumSubRegs > 1) {
+ // The last implicit use of the SuperReg carries the "Kill" flag.
+ unsigned SuperKillState = 0;
+ if (i + 1 == NumSubRegs)
+ SuperKillState |= getKillRegState(IsKill);
+ WriteLane.addReg(SuperReg, RegState::Implicit | SuperKillState);
+ }
}
- unsigned Align = FrameInfo.getObjectAlignment(Index);
- MachinePointerInfo PtrInfo
- = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
- MachineMemOperand *MMO
- = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
- EltSize, MinAlign(Align, EltSize * i));
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE))
- .addReg(TmpVGPR, RegState::Kill) // src
- .addFrameIndex(Index) // vaddr
- .addReg(MFI->getScratchRSrcReg()) // srrsrc
- .addReg(MFI->getStackPtrOffsetReg()) // soffset
- .addImm(i * 4) // offset
- .addMemOperand(MMO);
+ // Write out VGPR
+ buildSGPRSpillLoadStore(MI, Index, Offset, EltSize, TmpVGPR, VGPRLanes,
+ RS, false);
}
}
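
The lane bookkeeping used in the block above is simple enough to check by hand; the following standalone sketch prints the VGPR count and lane mask for a few tuple sizes, assuming the same 32-lanes-per-VGPR packing.

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
  const unsigned PerVGPR = 32; // one 32-bit sub-register per VGPR lane
  for (unsigned NumSubRegs : {1u, 3u, 16u, 32u}) {
    unsigned NumVGPRs = (NumSubRegs + PerVGPR - 1) / PerVGPR;
    int64_t VGPRLanes = (1LL << std::min(PerVGPR, NumSubRegs)) - 1LL;
    std::printf("subregs=%u -> VGPRs=%u lanes=0x%llx\n", NumSubRegs, NumVGPRs,
                (unsigned long long)VGPRLanes);
  }
  return 0;
}
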
@@ -867,13 +1108,14 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
if (OnlyToVGPR && !SpillToVGPR)
return false;
- MachineFrameInfo &FrameInfo = MF->getFrameInfo();
const SIInstrInfo *TII = ST.getInstrInfo();
const DebugLoc &DL = MI->getDebugLoc();
Register SuperReg = MI->getOperand(0).getReg();
assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
+ assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
+ SuperReg != AMDGPU::EXEC && "exec should never spill");
unsigned EltSize = 4;
@@ -882,52 +1124,49 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
- Register TmpVGPR;
-
- for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
- Register SubReg =
- NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);
+ if (SpillToVGPR) {
+ for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
+ Register SubReg =
+ NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);
- if (SpillToVGPR) {
SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
auto MIB =
BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
SubReg)
.addReg(Spill.VGPR)
.addImm(Spill.Lane);
-
if (NumSubRegs > 1 && i == 0)
MIB.addReg(SuperReg, RegState::ImplicitDefine);
- } else {
- if (OnlyToVGPR)
- return false;
-
- // Restore SGPR from a stack slot.
- // FIXME: We should use S_LOAD_DWORD here for VI.
- if (!TmpVGPR.isValid())
- TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
- unsigned Align = FrameInfo.getObjectAlignment(Index);
-
- MachinePointerInfo PtrInfo
- = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
-
- MachineMemOperand *MMO = MF->getMachineMemOperand(PtrInfo,
- MachineMemOperand::MOLoad, EltSize,
- MinAlign(Align, EltSize * i));
-
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpVGPR)
- .addFrameIndex(Index) // vaddr
- .addReg(MFI->getScratchRSrcReg()) // srsrc
- .addReg(MFI->getStackPtrOffsetReg()) // soffset
- .addImm(i * 4) // offset
- .addMemOperand(MMO);
-
- auto MIB =
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
- .addReg(TmpVGPR, RegState::Kill);
-
- if (NumSubRegs > 1)
- MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
+ }
+ } else {
+ Register TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
+ RS->setRegUsed(TmpVGPR);
+
+ unsigned PerVGPR = 32;
+ unsigned NumVGPRs = (NumSubRegs + (PerVGPR - 1)) / PerVGPR;
+ int64_t VGPRLanes = (1LL << std::min(PerVGPR, NumSubRegs)) - 1LL;
+
+ for (unsigned Offset = 0; Offset < NumVGPRs; ++Offset) {
+ // Load in VGPR data
+ buildSGPRSpillLoadStore(MI, Index, Offset, EltSize, TmpVGPR, VGPRLanes,
+ RS, true);
+
+ // Unpack lanes
+ for (unsigned i = Offset * PerVGPR,
+ e = std::min((Offset + 1) * PerVGPR, NumSubRegs);
+ i < e; ++i) {
+ Register SubReg =
+ NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);
+
+ bool LastSubReg = (i + 1 == e);
+ auto MIB =
+ BuildMI(*MBB, MI, DL,
+ TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), SubReg)
+ .addReg(TmpVGPR, getKillRegState(LastSubReg))
+ .addImm(i);
+ if (NumSubRegs > 1 && i == 0)
+ MIB.addReg(SuperReg, RegState::ImplicitDefine);
+ }
}
}
@@ -946,6 +1185,7 @@ bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
case AMDGPU::SI_SPILL_S1024_SAVE:
case AMDGPU::SI_SPILL_S512_SAVE:
case AMDGPU::SI_SPILL_S256_SAVE:
+ case AMDGPU::SI_SPILL_S192_SAVE:
case AMDGPU::SI_SPILL_S160_SAVE:
case AMDGPU::SI_SPILL_S128_SAVE:
case AMDGPU::SI_SPILL_S96_SAVE:
@@ -955,6 +1195,7 @@ bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
case AMDGPU::SI_SPILL_S1024_RESTORE:
case AMDGPU::SI_SPILL_S512_RESTORE:
case AMDGPU::SI_SPILL_S256_RESTORE:
+ case AMDGPU::SI_SPILL_S192_RESTORE:
case AMDGPU::SI_SPILL_S160_RESTORE:
case AMDGPU::SI_SPILL_S128_RESTORE:
case AMDGPU::SI_SPILL_S96_RESTORE:
@@ -981,13 +1222,16 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
MachineOperand &FIOp = MI->getOperand(FIOperandNum);
int Index = MI->getOperand(FIOperandNum).getIndex();
- Register FrameReg = getFrameRegister(*MF);
+ Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
+ ? getBaseRegister()
+ : getFrameRegister(*MF);
switch (MI->getOpcode()) {
// SGPR register spill
case AMDGPU::SI_SPILL_S1024_SAVE:
case AMDGPU::SI_SPILL_S512_SAVE:
case AMDGPU::SI_SPILL_S256_SAVE:
+ case AMDGPU::SI_SPILL_S192_SAVE:
case AMDGPU::SI_SPILL_S160_SAVE:
case AMDGPU::SI_SPILL_S128_SAVE:
case AMDGPU::SI_SPILL_S96_SAVE:
@@ -1001,6 +1245,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
case AMDGPU::SI_SPILL_S1024_RESTORE:
case AMDGPU::SI_SPILL_S512_RESTORE:
case AMDGPU::SI_SPILL_S256_RESTORE:
+ case AMDGPU::SI_SPILL_S192_RESTORE:
case AMDGPU::SI_SPILL_S160_RESTORE:
case AMDGPU::SI_SPILL_S128_RESTORE:
case AMDGPU::SI_SPILL_S96_RESTORE:
@@ -1076,42 +1321,30 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
bool IsMUBUF = TII->isMUBUF(*MI);
if (!IsMUBUF && !MFI->isEntryFunction()) {
- // Convert to an absolute stack address by finding the offset from the
- // scratch wave base and scaling by the wave size.
+ // Convert to a swizzled stack address by scaling by the wave size.
//
- // In an entry function/kernel the offset is already the absolute
- // address relative to the frame register.
-
- Register TmpDiffReg =
- RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
-
- // If there's no free SGPR, in-place modify the FP
- Register DiffReg = TmpDiffReg.isValid() ? TmpDiffReg : FrameReg;
+ // In an entry function/kernel the offset is already swizzled.
bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32;
- Register ResultReg = IsCopy ?
- MI->getOperand(0).getReg() :
- RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
-
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), DiffReg)
- .addReg(FrameReg)
- .addReg(MFI->getScratchWaveOffsetReg());
+ Register ResultReg =
+ IsCopy ? MI->getOperand(0).getReg()
+ : RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
int64_t Offset = FrameInfo.getObjectOffset(Index);
if (Offset == 0) {
// XXX - This never happens because of emergency scavenging slot at 0?
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg)
.addImm(ST.getWavefrontSizeLog2())
- .addReg(DiffReg);
+ .addReg(FrameReg);
} else {
if (auto MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) {
- Register ScaledReg =
- RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MIB, 0);
+ // Reuse ResultReg in intermediate step.
+ Register ScaledReg = ResultReg;
BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
ScaledReg)
.addImm(ST.getWavefrontSizeLog2())
- .addReg(DiffReg, RegState::Kill);
+ .addReg(FrameReg);
const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;
@@ -1148,10 +1381,10 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
// unavailable. Only one additional mov is needed.
Register TmpScaledReg =
RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
- Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : DiffReg;
+ Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
- .addReg(DiffReg, RegState::Kill)
+ .addReg(FrameReg)
.addImm(ST.getWavefrontSizeLog2());
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), ScaledReg)
.addReg(ScaledReg, RegState::Kill)
@@ -1165,19 +1398,12 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
.addReg(ScaledReg, RegState::Kill)
.addImm(Offset);
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
- .addReg(DiffReg, RegState::Kill)
+ .addReg(FrameReg)
.addImm(ST.getWavefrontSizeLog2());
}
}
}
- if (!TmpDiffReg.isValid()) {
- // Restore the FP.
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), FrameReg)
- .addReg(FrameReg)
- .addReg(MFI->getScratchWaveOffsetReg());
- }
-
// Don't introduce an extra copy if we're just materializing in a mov.
if (IsCopy)
MI->eraseFromParent();
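
A worked arithmetic sketch of the non-entry-function frame-index lowering above, with illustrative values: the frame register is scaled down by the wave size (a right shift by the log2 of the wavefront size) and the object's frame offset is added to form the per-lane address.

#include <cstdint>
#include <cstdio>

int main() {
  const unsigned WavefrontSizeLog2 = 6; // wave64; wave32 would use 5
  uint32_t FrameReg = 0x4000;           // value held in the frame register
  int64_t ObjectOffset = 48;            // FrameInfo.getObjectOffset(Index)
  uint32_t ResultReg = static_cast<uint32_t>(
      (FrameReg >> WavefrontSizeLog2) + ObjectOffset); // v_lshrrev + add
  std::printf("lane address = 0x%x\n", ResultReg);     // prints 0x130
  return 0;
}
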
@@ -1192,10 +1418,17 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
AMDGPU::getNamedOperandIdx(MI->getOpcode(),
AMDGPU::OpName::vaddr));
- assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
- MFI->getStackPtrOffsetReg());
-
- TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->setReg(FrameReg);
+ auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset);
+ assert((SOffset.isReg() &&
+ SOffset.getReg() == MFI->getStackPtrOffsetReg()) ||
+ (SOffset.isImm() && SOffset.getImm() == 0));
+ if (SOffset.isReg()) {
+ if (FrameReg == AMDGPU::NoRegister) {
+ SOffset.ChangeToImmediate(0);
+ } else {
+ SOffset.setReg(FrameReg);
+ }
+ }
int64_t Offset = FrameInfo.getObjectOffset(Index);
int64_t OldImm
@@ -1224,16 +1457,99 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
}
}
-StringRef SIRegisterInfo::getRegAsmName(unsigned Reg) const {
+StringRef SIRegisterInfo::getRegAsmName(MCRegister Reg) const {
return AMDGPUInstPrinter::getRegisterName(Reg);
}
+const TargetRegisterClass *
+SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) {
+ if (BitWidth == 1)
+ return &AMDGPU::VReg_1RegClass;
+ if (BitWidth <= 16)
+ return &AMDGPU::VGPR_LO16RegClass;
+ if (BitWidth <= 32)
+ return &AMDGPU::VGPR_32RegClass;
+ if (BitWidth <= 64)
+ return &AMDGPU::VReg_64RegClass;
+ if (BitWidth <= 96)
+ return &AMDGPU::VReg_96RegClass;
+ if (BitWidth <= 128)
+ return &AMDGPU::VReg_128RegClass;
+ if (BitWidth <= 160)
+ return &AMDGPU::VReg_160RegClass;
+ if (BitWidth <= 192)
+ return &AMDGPU::VReg_192RegClass;
+ if (BitWidth <= 256)
+ return &AMDGPU::VReg_256RegClass;
+ if (BitWidth <= 512)
+ return &AMDGPU::VReg_512RegClass;
+ if (BitWidth <= 1024)
+ return &AMDGPU::VReg_1024RegClass;
+
+ return nullptr;
+}
+
+const TargetRegisterClass *
+SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) {
+ if (BitWidth <= 16)
+ return &AMDGPU::AGPR_LO16RegClass;
+ if (BitWidth <= 32)
+ return &AMDGPU::AGPR_32RegClass;
+ if (BitWidth <= 64)
+ return &AMDGPU::AReg_64RegClass;
+ if (BitWidth <= 96)
+ return &AMDGPU::AReg_96RegClass;
+ if (BitWidth <= 128)
+ return &AMDGPU::AReg_128RegClass;
+ if (BitWidth <= 160)
+ return &AMDGPU::AReg_160RegClass;
+ if (BitWidth <= 192)
+ return &AMDGPU::AReg_192RegClass;
+ if (BitWidth <= 256)
+ return &AMDGPU::AReg_256RegClass;
+ if (BitWidth <= 512)
+ return &AMDGPU::AReg_512RegClass;
+ if (BitWidth <= 1024)
+ return &AMDGPU::AReg_1024RegClass;
+
+ return nullptr;
+}
+
+const TargetRegisterClass *
+SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) {
+ if (BitWidth <= 16)
+ return &AMDGPU::SGPR_LO16RegClass;
+ if (BitWidth <= 32)
+ return &AMDGPU::SReg_32RegClass;
+ if (BitWidth <= 64)
+ return &AMDGPU::SReg_64RegClass;
+ if (BitWidth <= 96)
+ return &AMDGPU::SGPR_96RegClass;
+ if (BitWidth <= 128)
+ return &AMDGPU::SGPR_128RegClass;
+ if (BitWidth <= 160)
+ return &AMDGPU::SGPR_160RegClass;
+ if (BitWidth <= 192)
+ return &AMDGPU::SGPR_192RegClass;
+ if (BitWidth <= 256)
+ return &AMDGPU::SGPR_256RegClass;
+ if (BitWidth <= 512)
+ return &AMDGPU::SGPR_512RegClass;
+ if (BitWidth <= 1024)
+ return &AMDGPU::SGPR_1024RegClass;
+
+ return nullptr;
+}
+
// FIXME: This is very slow. It might be worth creating a map from physreg to
// register class.
-const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
- assert(!Register::isVirtualRegister(Reg));
-
+const TargetRegisterClass *
+SIRegisterInfo::getPhysRegClass(MCRegister Reg) const {
static const TargetRegisterClass *const BaseClasses[] = {
+ &AMDGPU::VGPR_LO16RegClass,
+ &AMDGPU::VGPR_HI16RegClass,
+ &AMDGPU::SReg_LO16RegClass,
+ &AMDGPU::AGPR_LO16RegClass,
&AMDGPU::VGPR_32RegClass,
&AMDGPU::SReg_32RegClass,
&AMDGPU::AGPR_32RegClass,
@@ -1242,13 +1558,19 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
&AMDGPU::AReg_64RegClass,
&AMDGPU::VReg_96RegClass,
&AMDGPU::SReg_96RegClass,
+ &AMDGPU::AReg_96RegClass,
&AMDGPU::VReg_128RegClass,
&AMDGPU::SReg_128RegClass,
&AMDGPU::AReg_128RegClass,
&AMDGPU::VReg_160RegClass,
&AMDGPU::SReg_160RegClass,
+ &AMDGPU::AReg_160RegClass,
+ &AMDGPU::VReg_192RegClass,
+ &AMDGPU::SReg_192RegClass,
+ &AMDGPU::AReg_192RegClass,
&AMDGPU::VReg_256RegClass,
&AMDGPU::SReg_256RegClass,
+ &AMDGPU::AReg_256RegClass,
&AMDGPU::VReg_512RegClass,
&AMDGPU::SReg_512RegClass,
&AMDGPU::AReg_512RegClass,
@@ -1272,122 +1594,54 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
// TargetRegisterClass to mark which classes are VGPRs to make this trivial.
bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
unsigned Size = getRegSizeInBits(*RC);
- switch (Size) {
- case 32:
- return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr;
- case 64:
- return getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) != nullptr;
- case 96:
- return getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) != nullptr;
- case 128:
- return getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) != nullptr;
- case 160:
- return getCommonSubClass(&AMDGPU::VReg_160RegClass, RC) != nullptr;
- case 256:
- return getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) != nullptr;
- case 512:
- return getCommonSubClass(&AMDGPU::VReg_512RegClass, RC) != nullptr;
- case 1024:
- return getCommonSubClass(&AMDGPU::VReg_1024RegClass, RC) != nullptr;
- case 1:
- return getCommonSubClass(&AMDGPU::VReg_1RegClass, RC) != nullptr;
- default:
+ if (Size == 16) {
+ return getCommonSubClass(&AMDGPU::VGPR_LO16RegClass, RC) != nullptr ||
+ getCommonSubClass(&AMDGPU::VGPR_HI16RegClass, RC) != nullptr;
+ }
+ const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size);
+ if (!VRC) {
assert(Size < 32 && "Invalid register class size");
return false;
}
+ return getCommonSubClass(VRC, RC) != nullptr;
}
bool SIRegisterInfo::hasAGPRs(const TargetRegisterClass *RC) const {
unsigned Size = getRegSizeInBits(*RC);
- if (Size < 32)
+ if (Size < 16)
return false;
- switch (Size) {
- case 32:
- return getCommonSubClass(&AMDGPU::AGPR_32RegClass, RC) != nullptr;
- case 64:
- return getCommonSubClass(&AMDGPU::AReg_64RegClass, RC) != nullptr;
- case 96:
+ const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size);
+ if (!ARC) {
+ assert(getVGPRClassForBitWidth(Size) && "Invalid register class size");
return false;
- case 128:
- return getCommonSubClass(&AMDGPU::AReg_128RegClass, RC) != nullptr;
- case 160:
- case 256:
- return false;
- case 512:
- return getCommonSubClass(&AMDGPU::AReg_512RegClass, RC) != nullptr;
- case 1024:
- return getCommonSubClass(&AMDGPU::AReg_1024RegClass, RC) != nullptr;
- default:
- llvm_unreachable("Invalid register class size");
}
+ return getCommonSubClass(ARC, RC) != nullptr;
}
-const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass(
- const TargetRegisterClass *SRC) const {
- switch (getRegSizeInBits(*SRC)) {
- case 32:
- return &AMDGPU::VGPR_32RegClass;
- case 64:
- return &AMDGPU::VReg_64RegClass;
- case 96:
- return &AMDGPU::VReg_96RegClass;
- case 128:
- return &AMDGPU::VReg_128RegClass;
- case 160:
- return &AMDGPU::VReg_160RegClass;
- case 256:
- return &AMDGPU::VReg_256RegClass;
- case 512:
- return &AMDGPU::VReg_512RegClass;
- case 1024:
- return &AMDGPU::VReg_1024RegClass;
- case 1:
- return &AMDGPU::VReg_1RegClass;
- default:
- llvm_unreachable("Invalid register class size");
- }
+const TargetRegisterClass *
+SIRegisterInfo::getEquivalentVGPRClass(const TargetRegisterClass *SRC) const {
+ unsigned Size = getRegSizeInBits(*SRC);
+ const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size);
+ assert(VRC && "Invalid register class size");
+ return VRC;
}
-const TargetRegisterClass *SIRegisterInfo::getEquivalentAGPRClass(
- const TargetRegisterClass *SRC) const {
- switch (getRegSizeInBits(*SRC)) {
- case 32:
- return &AMDGPU::AGPR_32RegClass;
- case 64:
- return &AMDGPU::AReg_64RegClass;
- case 128:
- return &AMDGPU::AReg_128RegClass;
- case 512:
- return &AMDGPU::AReg_512RegClass;
- case 1024:
- return &AMDGPU::AReg_1024RegClass;
- default:
- llvm_unreachable("Invalid register class size");
- }
+const TargetRegisterClass *
+SIRegisterInfo::getEquivalentAGPRClass(const TargetRegisterClass *SRC) const {
+ unsigned Size = getRegSizeInBits(*SRC);
+ const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size);
+ assert(ARC && "Invalid register class size");
+ return ARC;
}
-const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass(
- const TargetRegisterClass *VRC) const {
- switch (getRegSizeInBits(*VRC)) {
- case 32:
+const TargetRegisterClass *
+SIRegisterInfo::getEquivalentSGPRClass(const TargetRegisterClass *VRC) const {
+ unsigned Size = getRegSizeInBits(*VRC);
+ if (Size == 32)
return &AMDGPU::SGPR_32RegClass;
- case 64:
- return &AMDGPU::SReg_64RegClass;
- case 96:
- return &AMDGPU::SReg_96RegClass;
- case 128:
- return &AMDGPU::SGPR_128RegClass;
- case 160:
- return &AMDGPU::SReg_160RegClass;
- case 256:
- return &AMDGPU::SReg_256RegClass;
- case 512:
- return &AMDGPU::SReg_512RegClass;
- case 1024:
- return &AMDGPU::SReg_1024RegClass;
- default:
- llvm_unreachable("Invalid register class size");
- }
+ const TargetRegisterClass *SRC = getSGPRClassForBitWidth(Size);
+ assert(SRC && "Invalid register class size");
+ return SRC;
}
const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
@@ -1396,62 +1650,19 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
return RC;
// We can assume that each lane corresponds to one 32-bit register.
- unsigned Count = getSubRegIndexLaneMask(SubIdx).getNumLanes();
+ unsigned Size = getNumChannelsFromSubReg(SubIdx) * 32;
if (isSGPRClass(RC)) {
- switch (Count) {
- case 1:
- return &AMDGPU::SGPR_32RegClass;
- case 2:
- return &AMDGPU::SReg_64RegClass;
- case 3:
- return &AMDGPU::SReg_96RegClass;
- case 4:
- return &AMDGPU::SGPR_128RegClass;
- case 5:
- return &AMDGPU::SReg_160RegClass;
- case 8:
- return &AMDGPU::SReg_256RegClass;
- case 16:
- return &AMDGPU::SReg_512RegClass;
- case 32: /* fall-through */
- default:
- llvm_unreachable("Invalid sub-register class size");
- }
+ if (Size == 32)
+ RC = &AMDGPU::SGPR_32RegClass;
+ else
+ RC = getSGPRClassForBitWidth(Size);
} else if (hasAGPRs(RC)) {
- switch (Count) {
- case 1:
- return &AMDGPU::AGPR_32RegClass;
- case 2:
- return &AMDGPU::AReg_64RegClass;
- case 4:
- return &AMDGPU::AReg_128RegClass;
- case 16:
- return &AMDGPU::AReg_512RegClass;
- case 32: /* fall-through */
- default:
- llvm_unreachable("Invalid sub-register class size");
- }
+ RC = getAGPRClassForBitWidth(Size);
} else {
- switch (Count) {
- case 1:
- return &AMDGPU::VGPR_32RegClass;
- case 2:
- return &AMDGPU::VReg_64RegClass;
- case 3:
- return &AMDGPU::VReg_96RegClass;
- case 4:
- return &AMDGPU::VReg_128RegClass;
- case 5:
- return &AMDGPU::VReg_160RegClass;
- case 8:
- return &AMDGPU::VReg_256RegClass;
- case 16:
- return &AMDGPU::VReg_512RegClass;
- case 32: /* fall-through */
- default:
- llvm_unreachable("Invalid sub-register class size");
- }
+ RC = getVGPRClassForBitWidth(Size);
}
+ assert(RC && "Invalid sub-register class size");
+ return RC;
}
bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
@@ -1487,215 +1698,60 @@ bool SIRegisterInfo::shouldRewriteCopySrc(
return getCommonSubClass(DefRC, SrcRC) != nullptr;
}
-/// Returns a register that is not used at any point in the function.
+/// Returns the lowest register that is not used at any point in the function.
/// If all registers are used, then this function will return
-// AMDGPU::NoRegister.
-unsigned
-SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
- const TargetRegisterClass *RC,
- const MachineFunction &MF) const {
-
- for (unsigned Reg : *RC)
- if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
- return Reg;
- return AMDGPU::NoRegister;
+/// AMDGPU::NoRegister. If \p ReserveHighestVGPR = true, then return the
+/// highest unused register.
+MCRegister SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
+ const TargetRegisterClass *RC,
+ const MachineFunction &MF,
+ bool ReserveHighestVGPR) const {
+ if (ReserveHighestVGPR) {
+ for (MCRegister Reg : reverse(*RC))
+ if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
+ return Reg;
+ } else {
+ for (MCRegister Reg : *RC)
+ if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
+ return Reg;
+ }
+ return MCRegister();
}
ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
unsigned EltSize) const {
- if (EltSize == 4) {
- static const int16_t Sub0_31[] = {
- AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
- AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
- AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
- AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
- AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19,
- AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23,
- AMDGPU::sub24, AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27,
- AMDGPU::sub28, AMDGPU::sub29, AMDGPU::sub30, AMDGPU::sub31,
- };
-
- static const int16_t Sub0_15[] = {
- AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
- AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
- AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
- AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
- };
-
- static const int16_t Sub0_7[] = {
- AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
- AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
- };
-
- static const int16_t Sub0_4[] = {
- AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4,
- };
-
- static const int16_t Sub0_3[] = {
- AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
- };
-
- static const int16_t Sub0_2[] = {
- AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2,
- };
-
- static const int16_t Sub0_1[] = {
- AMDGPU::sub0, AMDGPU::sub1,
- };
-
- switch (AMDGPU::getRegBitWidth(*RC->MC)) {
- case 32:
- return {};
- case 64:
- return makeArrayRef(Sub0_1);
- case 96:
- return makeArrayRef(Sub0_2);
- case 128:
- return makeArrayRef(Sub0_3);
- case 160:
- return makeArrayRef(Sub0_4);
- case 256:
- return makeArrayRef(Sub0_7);
- case 512:
- return makeArrayRef(Sub0_15);
- case 1024:
- return makeArrayRef(Sub0_31);
- default:
- llvm_unreachable("unhandled register size");
- }
- }
-
- if (EltSize == 8) {
- static const int16_t Sub0_31_64[] = {
- AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
- AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
- AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
- AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
- AMDGPU::sub16_sub17, AMDGPU::sub18_sub19,
- AMDGPU::sub20_sub21, AMDGPU::sub22_sub23,
- AMDGPU::sub24_sub25, AMDGPU::sub26_sub27,
- AMDGPU::sub28_sub29, AMDGPU::sub30_sub31
- };
-
- static const int16_t Sub0_15_64[] = {
- AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
- AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
- AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
- AMDGPU::sub12_sub13, AMDGPU::sub14_sub15
- };
-
- static const int16_t Sub0_7_64[] = {
- AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
- AMDGPU::sub4_sub5, AMDGPU::sub6_sub7
- };
-
-
- static const int16_t Sub0_3_64[] = {
- AMDGPU::sub0_sub1, AMDGPU::sub2_sub3
- };
-
- switch (AMDGPU::getRegBitWidth(*RC->MC)) {
- case 64:
- return {};
- case 128:
- return makeArrayRef(Sub0_3_64);
- case 256:
- return makeArrayRef(Sub0_7_64);
- case 512:
- return makeArrayRef(Sub0_15_64);
- case 1024:
- return makeArrayRef(Sub0_31_64);
- default:
- llvm_unreachable("unhandled register size");
- }
- }
+ const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC->MC);
+ assert(RegBitWidth >= 32 && RegBitWidth <= 1024);
- if (EltSize == 16) {
-
- static const int16_t Sub0_31_128[] = {
- AMDGPU::sub0_sub1_sub2_sub3,
- AMDGPU::sub4_sub5_sub6_sub7,
- AMDGPU::sub8_sub9_sub10_sub11,
- AMDGPU::sub12_sub13_sub14_sub15,
- AMDGPU::sub16_sub17_sub18_sub19,
- AMDGPU::sub20_sub21_sub22_sub23,
- AMDGPU::sub24_sub25_sub26_sub27,
- AMDGPU::sub28_sub29_sub30_sub31
- };
-
- static const int16_t Sub0_15_128[] = {
- AMDGPU::sub0_sub1_sub2_sub3,
- AMDGPU::sub4_sub5_sub6_sub7,
- AMDGPU::sub8_sub9_sub10_sub11,
- AMDGPU::sub12_sub13_sub14_sub15
- };
-
- static const int16_t Sub0_7_128[] = {
- AMDGPU::sub0_sub1_sub2_sub3,
- AMDGPU::sub4_sub5_sub6_sub7
- };
-
- switch (AMDGPU::getRegBitWidth(*RC->MC)) {
- case 128:
- return {};
- case 256:
- return makeArrayRef(Sub0_7_128);
- case 512:
- return makeArrayRef(Sub0_15_128);
- case 1024:
- return makeArrayRef(Sub0_31_128);
- default:
- llvm_unreachable("unhandled register size");
- }
- }
-
- assert(EltSize == 32 && "unhandled elt size");
-
- static const int16_t Sub0_31_256[] = {
- AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7,
- AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15,
- AMDGPU::sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23,
- AMDGPU::sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31
- };
+ const unsigned RegDWORDs = RegBitWidth / 32;
+ const unsigned EltDWORDs = EltSize / 4;
+ assert(RegSplitParts.size() + 1 >= EltDWORDs);
- static const int16_t Sub0_15_256[] = {
- AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7,
- AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15
- };
+ const std::vector<int16_t> &Parts = RegSplitParts[EltDWORDs - 1];
+ const unsigned NumParts = RegDWORDs / EltDWORDs;
- switch (AMDGPU::getRegBitWidth(*RC->MC)) {
- case 256:
- return {};
- case 512:
- return makeArrayRef(Sub0_15_256);
- case 1024:
- return makeArrayRef(Sub0_31_256);
- default:
- llvm_unreachable("unhandled register size");
- }
+ return makeArrayRef(Parts.data(), NumParts);
}
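As a rough illustration only (not part of the patch), the lookup above reduces to simple DWORD arithmetic. The sketch below mirrors it with a hypothetical stand-in for RegSplitParts[1]; only the subregister names and counts are meant to match.

#include <cassert>
#include <iostream>
#include <string>
#include <vector>

int main() {
  // Hypothetical stand-in for RegSplitParts[EltDWORDs - 1] with EltDWORDs == 2
  // (2-DWORD subregs, sorted by bit offset).
  std::vector<std::string> TwoDWordParts = {
      "sub0_sub1",   "sub2_sub3",   "sub4_sub5",   "sub6_sub7",
      "sub8_sub9",   "sub10_sub11", "sub12_sub13", "sub14_sub15"};

  const unsigned RegBitWidth = 512; // e.g. a 512-bit register class
  const unsigned EltSize = 8;       // split into 64-bit (8-byte) elements

  const unsigned RegDWORDs = RegBitWidth / 32;     // 16
  const unsigned EltDWORDs = EltSize / 4;          // 2
  const unsigned NumParts = RegDWORDs / EltDWORDs; // 8 subregs cover the reg

  assert(NumParts <= TwoDWordParts.size());
  for (unsigned I = 0; I != NumParts; ++I)
    std::cout << TwoDWordParts[I] << "\n"; // sub0_sub1 ... sub14_sub15
  return 0;
}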
const TargetRegisterClass*
SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI,
- unsigned Reg) const {
- if (Register::isVirtualRegister(Reg))
- return MRI.getRegClass(Reg);
-
- return getPhysRegClass(Reg);
+ Register Reg) const {
+ return Reg.isVirtual() ? MRI.getRegClass(Reg) : getPhysRegClass(Reg);
}
bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
- unsigned Reg) const {
- const TargetRegisterClass * RC = getRegClassForReg(MRI, Reg);
- assert(RC && "Register class for the reg not found");
- return hasVGPRs(RC);
+ Register Reg) const {
+ const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
+ // Registers without classes are unaddressable, SGPR-like registers.
+ return RC && hasVGPRs(RC);
}
bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI,
- unsigned Reg) const {
- const TargetRegisterClass * RC = getRegClassForReg(MRI, Reg);
- assert(RC && "Register class for the reg not found");
- return hasAGPRs(RC);
+ Register Reg) const {
+ const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
+
+ // Registers without classes are unaddressable, SGPR-like registers.
+ return RC && hasAGPRs(RC);
}
bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
@@ -1727,36 +1783,41 @@ unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
MF.getFunction());
switch (RC->getID()) {
default:
- return AMDGPURegisterInfo::getRegPressureLimit(RC, MF);
+ return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF);
case AMDGPU::VGPR_32RegClassID:
+ case AMDGPU::VGPR_LO16RegClassID:
+ case AMDGPU::VGPR_HI16RegClassID:
return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF));
case AMDGPU::SGPR_32RegClassID:
+ case AMDGPU::SGPR_LO16RegClassID:
return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF));
}
}
unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
unsigned Idx) const {
- if (Idx == getVGPRPressureSet() || Idx == getAGPRPressureSet())
+ if (Idx == AMDGPU::RegisterPressureSets::VGPR_32 ||
+ Idx == AMDGPU::RegisterPressureSets::AGPR_32)
return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
const_cast<MachineFunction &>(MF));
- if (Idx == getSGPRPressureSet())
+ if (Idx == AMDGPU::RegisterPressureSets::SReg_32)
return getRegPressureLimit(&AMDGPU::SGPR_32RegClass,
const_cast<MachineFunction &>(MF));
- return AMDGPURegisterInfo::getRegPressureSetLimit(MF, Idx);
+ llvm_unreachable("Unexpected register pressure set!");
}
const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
static const int Empty[] = { -1 };
- if (hasRegUnit(AMDGPU::M0, RegUnit))
+ if (RegPressureIgnoredUnits[RegUnit])
return Empty;
- return AMDGPURegisterInfo::getRegUnitPressureSets(RegUnit);
+
+ return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit);
}
-unsigned SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
+MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
// Not a callee saved register.
return AMDGPU::SGPR30_SGPR31;
}
@@ -1765,49 +1826,19 @@ const TargetRegisterClass *
SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
const RegisterBank &RB,
const MachineRegisterInfo &MRI) const {
- switch (Size) {
- case 1: {
- switch (RB.getID()) {
- case AMDGPU::VGPRRegBankID:
- return &AMDGPU::VGPR_32RegClass;
- case AMDGPU::VCCRegBankID:
- return isWave32 ?
- &AMDGPU::SReg_32_XM0_XEXECRegClass : &AMDGPU::SReg_64_XEXECRegClass;
- case AMDGPU::SGPRRegBankID:
- return &AMDGPU::SReg_32RegClass;
- default:
- llvm_unreachable("unknown register bank");
- }
- }
- case 32:
- return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass :
- &AMDGPU::SReg_32RegClass;
- case 64:
- return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_64RegClass :
- &AMDGPU::SReg_64RegClass;
- case 96:
- return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_96RegClass :
- &AMDGPU::SReg_96RegClass;
- case 128:
- return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_128RegClass :
- &AMDGPU::SGPR_128RegClass;
- case 160:
- return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_160RegClass :
- &AMDGPU::SReg_160RegClass;
- case 256:
- return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_256RegClass :
- &AMDGPU::SReg_256RegClass;
- case 512:
- return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_512RegClass :
- &AMDGPU::SReg_512RegClass;
- case 1024:
- return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_1024RegClass :
- &AMDGPU::SReg_1024RegClass;
+ switch (RB.getID()) {
+ case AMDGPU::VGPRRegBankID:
+ return getVGPRClassForBitWidth(std::max(32u, Size));
+ case AMDGPU::VCCRegBankID:
+ assert(Size == 1);
+ return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass
+ : &AMDGPU::SReg_64_XEXECRegClass;
+ case AMDGPU::SGPRRegBankID:
+ return getSGPRClassForBitWidth(std::max(32u, Size));
+ case AMDGPU::AGPRRegBankID:
+ return getAGPRClassForBitWidth(std::max(32u, Size));
default:
- if (Size < 32)
- return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass :
- &AMDGPU::SReg_32RegClass;
- return nullptr;
+ llvm_unreachable("unknown register bank");
}
}
@@ -1822,7 +1853,7 @@ SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
return getAllocatableClass(RC);
}
-unsigned SIRegisterInfo::getVCC() const {
+MCRegister SIRegisterInfo::getVCC() const {
return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
}
@@ -1837,12 +1868,12 @@ SIRegisterInfo::getRegClass(unsigned RCID) const {
case -1:
return nullptr;
default:
- return AMDGPURegisterInfo::getRegClass(RCID);
+ return AMDGPUGenRegisterInfo::getRegClass(RCID);
}
}
// Find reaching register definition
-MachineInstr *SIRegisterInfo::findReachingDef(unsigned Reg, unsigned SubReg,
+MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg,
MachineInstr &Use,
MachineRegisterInfo &MRI,
LiveIntervals *LIS) const {
@@ -1850,7 +1881,7 @@ MachineInstr *SIRegisterInfo::findReachingDef(unsigned Reg, unsigned SubReg,
SlotIndex UseIdx = LIS->getInstructionIndex(Use);
SlotIndex DefIdx;
- if (Register::isVirtualRegister(Reg)) {
+ if (Reg.isVirtual()) {
if (!LIS->hasInterval(Reg))
return nullptr;
LiveInterval &LI = LIS->getInterval(Reg);
@@ -1894,3 +1925,49 @@ MachineInstr *SIRegisterInfo::findReachingDef(unsigned Reg, unsigned SubReg,
return Def;
}
+
+MCPhysReg SIRegisterInfo::get32BitRegister(MCPhysReg Reg) const {
+ assert(getRegSizeInBits(*getPhysRegClass(Reg)) <= 32);
+
+ for (const TargetRegisterClass &RC : { AMDGPU::VGPR_32RegClass,
+ AMDGPU::SReg_32RegClass,
+ AMDGPU::AGPR_32RegClass } ) {
+ if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::lo16, &RC))
+ return Super;
+ }
+ if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::hi16,
+ &AMDGPU::VGPR_32RegClass)) {
+ return Super;
+ }
+
+ return AMDGPU::NoRegister;
+}
+
+bool SIRegisterInfo::isConstantPhysReg(MCRegister PhysReg) const {
+ switch (PhysReg) {
+ case AMDGPU::SGPR_NULL:
+ case AMDGPU::SRC_SHARED_BASE:
+ case AMDGPU::SRC_PRIVATE_BASE:
+ case AMDGPU::SRC_SHARED_LIMIT:
+ case AMDGPU::SRC_PRIVATE_LIMIT:
+ return true;
+ default:
+ return false;
+ }
+}
+
+ArrayRef<MCPhysReg>
+SIRegisterInfo::getAllSGPR128(const MachineFunction &MF) const {
+ return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
+ ST.getMaxNumSGPRs(MF) / 4);
+}
+
+ArrayRef<MCPhysReg>
+SIRegisterInfo::getAllSGPR32(const MachineFunction &MF) const {
+ return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF));
+}
+
+ArrayRef<MCPhysReg>
+SIRegisterInfo::getAllVGPR32(const MachineFunction &MF) const {
+ return makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), ST.getMaxNumVGPRs(MF));
+}
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index ac8c56fa3a038..62d9f1174337b 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -14,7 +14,9 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_SIREGISTERINFO_H
#define LLVM_LIB_TARGET_AMDGPU_SIREGISTERINFO_H
-#include "AMDGPURegisterInfo.h"
+#define GET_REGINFO_HEADER
+#include "AMDGPUGenRegisterInfo.inc"
+
#include "SIDefines.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -22,38 +24,38 @@ namespace llvm {
class GCNSubtarget;
class LiveIntervals;
-class MachineRegisterInfo;
class SIMachineFunctionInfo;
-class SIRegisterInfo final : public AMDGPURegisterInfo {
+class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
private:
const GCNSubtarget &ST;
- unsigned SGPRSetID;
- unsigned VGPRSetID;
- unsigned AGPRSetID;
- BitVector SGPRPressureSets;
- BitVector VGPRPressureSets;
- BitVector AGPRPressureSets;
bool SpillSGPRToVGPR;
bool isWave32;
+ BitVector RegPressureIgnoredUnits;
+
+  /// Sub reg indexes for getRegSplitParts.
+  /// The first index represents the subreg size, from 1 to 16 DWORDs.
+  /// The inner vector is sorted by bit offset.
+  /// Provided a register can be fully split with the given subregs,
+  /// all elements of the inner vector combined give a full lane mask.
+ static std::array<std::vector<int16_t>, 16> RegSplitParts;
+
+ void reserveRegisterTuples(BitVector &, MCRegister Reg) const;
- void classifyPressureSet(unsigned PSetID, unsigned Reg,
- BitVector &PressureSets) const;
public:
SIRegisterInfo(const GCNSubtarget &ST);
+ /// \returns the sub reg enum value for the given \p Channel
+ /// (e.g. getSubRegFromChannel(0) -> AMDGPU::sub0)
+ static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs = 1);
+
bool spillSGPRToVGPR() const {
return SpillSGPRToVGPR;
}
/// Return the end register initially reserved for the scratch buffer in case
/// spilling is needed.
- unsigned reservedPrivateSegmentBufferReg(const MachineFunction &MF) const;
-
- /// Return the end register initially reserved for the scratch wave offset in
- /// case spilling is needed.
- unsigned reservedPrivateSegmentWaveByteOffsetReg(
- const MachineFunction &MF) const;
+ MCRegister reservedPrivateSegmentBufferReg(const MachineFunction &MF) const;
BitVector getReservedRegs(const MachineFunction &MF) const override;
@@ -70,6 +72,9 @@ public:
Register getFrameRegister(const MachineFunction &MF) const override;
+ bool hasBasePointer(const MachineFunction &MF) const;
+ Register getBaseRegister() const;
+
bool canRealignStack(const MachineFunction &MF) const override;
bool requiresRegisterScavenging(const MachineFunction &Fn) const override;
@@ -77,7 +82,6 @@ public:
bool requiresFrameIndexReplacementScavenging(
const MachineFunction &MF) const override;
bool requiresVirtualBaseRegisters(const MachineFunction &Fn) const override;
- bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override;
int64_t getMUBUFInstrOffset(const MachineInstr *MI) const;
@@ -86,19 +90,24 @@ public:
bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override;
- void materializeFrameBaseRegister(MachineBasicBlock *MBB,
- unsigned BaseReg, int FrameIdx,
+ void materializeFrameBaseRegister(MachineBasicBlock *MBB, Register BaseReg,
+ int FrameIdx,
int64_t Offset) const override;
- void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+ void resolveFrameIndex(MachineInstr &MI, Register BaseReg,
int64_t Offset) const override;
- bool isFrameOffsetLegal(const MachineInstr *MI, unsigned BaseReg,
+ bool isFrameOffsetLegal(const MachineInstr *MI, Register BaseReg,
int64_t Offset) const override;
const TargetRegisterClass *getPointerRegClass(
const MachineFunction &MF, unsigned Kind = 0) const override;
+ void buildSGPRSpillLoadStore(MachineBasicBlock::iterator MI, int Index,
+ int Offset, unsigned EltSize, Register VGPR,
+ int64_t VGPRLanes, RegScavenger *RS,
+ bool IsLoad) const;
+
/// If \p OnlyToVGPR is true, this will only succeed if this
bool spillSGPR(MachineBasicBlock::iterator MI,
int FI, RegScavenger *RS,
@@ -115,15 +124,19 @@ public:
bool eliminateSGPRToVGPRSpillFrameIndex(MachineBasicBlock::iterator MI,
int FI, RegScavenger *RS) const;
- StringRef getRegAsmName(unsigned Reg) const override;
+ StringRef getRegAsmName(MCRegister Reg) const override;
- unsigned getHWRegIndex(unsigned Reg) const {
+ unsigned getHWRegIndex(MCRegister Reg) const {
return getEncodingValue(Reg) & 0xff;
}
+ static const TargetRegisterClass *getVGPRClassForBitWidth(unsigned BitWidth);
+ static const TargetRegisterClass *getAGPRClassForBitWidth(unsigned BitWidth);
+ static const TargetRegisterClass *getSGPRClassForBitWidth(unsigned BitWidth);
+
/// Return the 'base' register class for this register.
/// e.g. SGPR0 => SReg_32, VGPR => VGPR_32 SGPR0_SGPR1 -> SReg_32, etc.
- const TargetRegisterClass *getPhysRegClass(unsigned Reg) const;
+ const TargetRegisterClass *getPhysRegClass(MCRegister Reg) const;
/// \returns true if this class contains only SGPR registers
bool isSGPRClass(const TargetRegisterClass *RC) const {
@@ -135,9 +148,9 @@ public:
return isSGPRClass(getRegClass(RCID));
}
- bool isSGPRReg(const MachineRegisterInfo &MRI, unsigned Reg) const {
+ bool isSGPRReg(const MachineRegisterInfo &MRI, Register Reg) const {
const TargetRegisterClass *RC;
- if (Register::isVirtualRegister(Reg))
+ if (Reg.isVirtual())
RC = MRI.getRegClass(Reg);
else
RC = getPhysRegClass(Reg);
@@ -161,16 +174,16 @@ public:
}
/// \returns A VGPR reg class with the same width as \p SRC
- const TargetRegisterClass *getEquivalentVGPRClass(
- const TargetRegisterClass *SRC) const;
+ const TargetRegisterClass *
+ getEquivalentVGPRClass(const TargetRegisterClass *SRC) const;
/// \returns An AGPR reg class with the same width as \p SRC
- const TargetRegisterClass *getEquivalentAGPRClass(
- const TargetRegisterClass *SRC) const;
+ const TargetRegisterClass *
+ getEquivalentAGPRClass(const TargetRegisterClass *SRC) const;
/// \returns A SGPR reg class with the same width as \p SRC
- const TargetRegisterClass *getEquivalentSGPRClass(
- const TargetRegisterClass *VRC) const;
+ const TargetRegisterClass *
+ getEquivalentSGPRClass(const TargetRegisterClass *VRC) const;
/// \returns The register class that is used for a sub-register of \p RC for
/// the given \p SubIdx. If \p SubIdx equals NoSubRegister, \p RC will
@@ -196,38 +209,23 @@ public:
/// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f.
bool opCanUseInlineConstant(unsigned OpType) const;
- unsigned findUnusedRegister(const MachineRegisterInfo &MRI,
- const TargetRegisterClass *RC,
- const MachineFunction &MF) const;
-
- unsigned getSGPRPressureSet() const { return SGPRSetID; };
- unsigned getVGPRPressureSet() const { return VGPRSetID; };
- unsigned getAGPRPressureSet() const { return AGPRSetID; };
+ MCRegister findUnusedRegister(const MachineRegisterInfo &MRI,
+ const TargetRegisterClass *RC,
+ const MachineFunction &MF,
+ bool ReserveHighestVGPR = false) const;
const TargetRegisterClass *getRegClassForReg(const MachineRegisterInfo &MRI,
- unsigned Reg) const;
- bool isVGPR(const MachineRegisterInfo &MRI, unsigned Reg) const;
- bool isAGPR(const MachineRegisterInfo &MRI, unsigned Reg) const;
- bool isVectorRegister(const MachineRegisterInfo &MRI, unsigned Reg) const {
+ Register Reg) const;
+ bool isVGPR(const MachineRegisterInfo &MRI, Register Reg) const;
+ bool isAGPR(const MachineRegisterInfo &MRI, Register Reg) const;
+ bool isVectorRegister(const MachineRegisterInfo &MRI, Register Reg) const {
return isVGPR(MRI, Reg) || isAGPR(MRI, Reg);
}
- virtual bool
- isDivergentRegClass(const TargetRegisterClass *RC) const override {
- return !isSGPRClass(RC);
- }
+ bool isConstantPhysReg(MCRegister PhysReg) const override;
- bool isSGPRPressureSet(unsigned SetID) const {
- return SGPRPressureSets.test(SetID) && !VGPRPressureSets.test(SetID) &&
- !AGPRPressureSets.test(SetID);
- }
- bool isVGPRPressureSet(unsigned SetID) const {
- return VGPRPressureSets.test(SetID) && !SGPRPressureSets.test(SetID) &&
- !AGPRPressureSets.test(SetID);
- }
- bool isAGPRPressureSet(unsigned SetID) const {
- return AGPRPressureSets.test(SetID) && !SGPRPressureSets.test(SetID) &&
- !VGPRPressureSets.test(SetID);
+ bool isDivergentRegClass(const TargetRegisterClass *RC) const override {
+ return !isSGPRClass(RC);
}
ArrayRef<int16_t> getRegSplitParts(const TargetRegisterClass *RC,
@@ -249,7 +247,7 @@ public:
const int *getRegUnitPressureSets(unsigned RegUnit) const override;
- unsigned getReturnAddressReg(const MachineFunction &MF) const;
+ MCRegister getReturnAddressReg(const MachineFunction &MF) const;
const TargetRegisterClass *
getRegClassForSizeOnBank(unsigned Size,
@@ -277,12 +275,12 @@ public:
: &AMDGPU::SReg_64_XEXECRegClass;
}
- unsigned getVCC() const;
+ MCRegister getVCC() const;
const TargetRegisterClass *getRegClass(unsigned RCID) const;
// Find reaching register definition
- MachineInstr *findReachingDef(unsigned Reg, unsigned SubReg,
+ MachineInstr *findReachingDef(Register Reg, unsigned SubReg,
MachineInstr &Use,
MachineRegisterInfo &MRI,
LiveIntervals *LIS) const;
@@ -290,14 +288,51 @@ public:
const uint32_t *getAllVGPRRegMask() const;
const uint32_t *getAllAllocatableSRegMask() const;
+  // \returns the number of 32-bit registers covered by \p LM
+ static unsigned getNumCoveredRegs(LaneBitmask LM) {
+ // The assumption is that every lo16 subreg is an even bit and every hi16
+ // is an adjacent odd bit or vice versa.
+ uint64_t Mask = LM.getAsInteger();
+ uint64_t Even = Mask & 0xAAAAAAAAAAAAAAAAULL;
+ Mask = (Even >> 1) | Mask;
+ uint64_t Odd = Mask & 0x5555555555555555ULL;
+ return countPopulation(Odd);
+ }
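A minimal standalone check (not part of this patch) of the even/odd folding above, assuming the stated layout where each 32-bit register owns one lo16 lane bit and the adjacent hi16 lane bit:

#include <bitset>
#include <cassert>
#include <cstdint>

// Mirrors getNumCoveredRegs() on a raw 64-bit lane mask.
static unsigned coveredRegs(uint64_t Mask) {
  uint64_t Even = Mask & 0xAAAAAAAAAAAAAAAAULL; // upper half of each bit pair
  Mask = (Even >> 1) | Mask;                    // fold each pair onto its low bit
  uint64_t Odd = Mask & 0x5555555555555555ULL;  // one bit left per covered reg
  return std::bitset<64>(Odd).count();
}

int main() {
  assert(coveredRegs(0b0100) == 1); // only the lo16 half of register 1
  assert(coveredRegs(0b1000) == 1); // only the hi16 half of register 1
  assert(coveredRegs(0b1111) == 2); // both halves of registers 0 and 1
  assert(coveredRegs(0) == 0);
  return 0;
}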
+
+  // \returns the DWORD offset of \p SubReg
+ unsigned getChannelFromSubReg(unsigned SubReg) const {
+ return SubReg ? (getSubRegIdxOffset(SubReg) + 31) / 32 : 0;
+ }
+
+  // \returns the size of \p SubReg in DWORDs
+ unsigned getNumChannelsFromSubReg(unsigned SubReg) const {
+ return getNumCoveredRegs(getSubRegIndexLaneMask(SubReg));
+ }
+
+  // For a given 16-bit \p Reg, \returns the 32-bit register holding it.
+ // \returns \p Reg otherwise.
+ MCPhysReg get32BitRegister(MCPhysReg Reg) const;
+
+ /// Return all SGPR128 which satisfy the waves per execution unit requirement
+ /// of the subtarget.
+ ArrayRef<MCPhysReg> getAllSGPR128(const MachineFunction &MF) const;
+
+ /// Return all SGPR32 which satisfy the waves per execution unit requirement
+ /// of the subtarget.
+ ArrayRef<MCPhysReg> getAllSGPR32(const MachineFunction &MF) const;
+
+ /// Return all VGPR32 which satisfy the waves per execution unit requirement
+ /// of the subtarget.
+ ArrayRef<MCPhysReg> getAllVGPR32(const MachineFunction &MF) const;
+
private:
void buildSpillLoadStore(MachineBasicBlock::iterator MI,
unsigned LoadStoreOp,
int Index,
- unsigned ValueReg,
+ Register ValueReg,
bool ValueIsKill,
- unsigned ScratchRsrcReg,
- unsigned ScratchOffsetReg,
+ MCRegister ScratchRsrcReg,
+ MCRegister ScratchOffsetReg,
int64_t InstrOffset,
MachineMemOperand *MMO,
RegScavenger *RS) const;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 6ea6ec00e742d..ff1f5c4bc49b1 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -7,6 +7,50 @@
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
+// Subregister declarations
+//===----------------------------------------------------------------------===//
+
+class Indexes<int N> {
+ list<int> all = [0, 1, 2, 3, 4, 5, 6 , 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31];
+
+ // Returns list of indexes [0..N)
+ list<int> slice =
+ !foldl([]<int>, all, acc, cur,
+ !listconcat(acc, !if(!lt(cur, N), [cur], [])));
+}
+
+let Namespace = "AMDGPU" in {
+
+def lo16 : SubRegIndex<16, 0>;
+def hi16 : SubRegIndex<16, 16>;
+
+foreach Index = 0-31 in {
+ def sub#Index : SubRegIndex<32, !shl(Index, 5)>;
+}
+
+foreach Index = 1-31 in {
+ def sub#Index#_lo16 : ComposedSubRegIndex<!cast<SubRegIndex>(sub#Index), lo16>;
+ def sub#Index#_hi16 : ComposedSubRegIndex<!cast<SubRegIndex>(sub#Index), hi16>;
+}
+
+foreach Size = {2-6,8,16} in {
+ foreach Index = Indexes<!add(33, !mul(Size, -1))>.slice in {
+ def !foldl("", Indexes<Size>.slice, acc, cur,
+ !strconcat(acc#!if(!eq(acc,""),"","_"), "sub"#!add(cur, Index))) :
+ SubRegIndex<!mul(Size, 32), !shl(Index, 5)> {
+ let CoveringSubRegIndices =
+ !foldl([]<SubRegIndex>, Indexes<Size>.slice, acc, cur,
+ !listconcat(acc, [!cast<SubRegIndex>(sub#!add(cur, Index))]));
+ }
+ }
+}
+
+}
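The foreach/!foldl above is dense; as a hedged illustration only (plain C++, not TableGen, and not part of this patch), the loop below prints the composed subregister index names and their geometry that the declarations should generate, assuming the slice helper yields [0..N) as commented:

#include <iostream>
#include <string>

int main() {
  const unsigned Sizes[] = {2, 3, 4, 5, 6, 8, 16}; // tuple sizes in DWORDs
  for (unsigned Size : Sizes) {
    // Index ranges over [0, 33 - Size), i.e. every start that fits in 32 lanes.
    for (unsigned Index = 0; Index + Size <= 32; ++Index) {
      std::string Name;
      for (unsigned I = 0; I != Size; ++I)
        Name += (I ? "_sub" : "sub") + std::to_string(Index + I);
      std::cout << Name << "  // offset " << Index * 32 << ", width "
                << Size * 32 << " bits\n";
    }
  }
  return 0;
}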
+
+//===----------------------------------------------------------------------===//
// Helpers
//===----------------------------------------------------------------------===//
@@ -15,6 +59,7 @@ class getSubRegs<int size> {
list<SubRegIndex> ret3 = [sub0, sub1, sub2];
list<SubRegIndex> ret4 = [sub0, sub1, sub2, sub3];
list<SubRegIndex> ret5 = [sub0, sub1, sub2, sub3, sub4];
+ list<SubRegIndex> ret6 = [sub0, sub1, sub2, sub3, sub4, sub5];
list<SubRegIndex> ret8 = [sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7];
list<SubRegIndex> ret16 = [sub0, sub1, sub2, sub3,
sub4, sub5, sub6, sub7,
@@ -33,8 +78,10 @@ class getSubRegs<int size> {
!if(!eq(size, 3), ret3,
!if(!eq(size, 4), ret4,
!if(!eq(size, 5), ret5,
- !if(!eq(size, 8), ret8,
- !if(!eq(size, 16), ret16, ret32))))));
+ !if(!eq(size, 6), ret6,
+ !if(!eq(size, 8), ret8,
+ !if(!eq(size, 16), ret16,
+ ret32)))))));
}
// Generates list of sequential register tuple names.
@@ -74,39 +121,69 @@ class SIRegisterTuples<list<SubRegIndex> Indices, RegisterClass RC,
// Declarations that describe the SI registers
//===----------------------------------------------------------------------===//
class SIReg <string n, bits<16> regIdx = 0> :
- Register<n>,
- DwarfRegNum<[!cast<int>(HWEncoding)]> {
+ Register<n> {
let Namespace = "AMDGPU";
-
- // This is the not yet the complete register encoding. An additional
- // bit is set for VGPRs.
let HWEncoding = regIdx;
}
+class SIRegWithSubRegs <string n, list<Register> subregs, bits<16> regIdx> :
+ RegisterWithSubRegs<n, subregs> {
+}
+
+multiclass SIRegLoHi16 <string n, bits<16> regIdx, bit ArtificialHigh = 1,
+ bit HWEncodingHigh = 0> {
+  // There is no special encoding for 16 bit subregs; these are not real
+  // registers but rather operands for instructions preserving the other 16
+  // bits of the result or reading just 16 bits of a 32 bit VGPR.
+  // They are encoded as the corresponding 32 bit register.
+  // Non-VGPR register classes use them as we need to have matching
+  // subregisters to move instructions and data between ALUs.
+ def _LO16 : SIReg<n#".l", regIdx> {
+ let HWEncoding{8} = HWEncodingHigh;
+ }
+ def _HI16 : SIReg<!if(ArtificialHigh, "", n#".h"), regIdx> {
+ let isArtificial = ArtificialHigh;
+ let HWEncoding{8} = HWEncodingHigh;
+ }
+ def "" : RegisterWithSubRegs<n, [!cast<Register>(NAME#"_LO16"),
+ !cast<Register>(NAME#"_HI16")]> {
+ let Namespace = "AMDGPU";
+ let SubRegIndices = [lo16, hi16];
+ let CoveredBySubRegs = !if(ArtificialHigh,0,1);
+ let HWEncoding = regIdx;
+ let HWEncoding{8} = HWEncodingHigh;
+ }
+}
+
// Special Registers
-def VCC_LO : SIReg<"vcc_lo", 106>;
-def VCC_HI : SIReg<"vcc_hi", 107>;
+defm VCC_LO : SIRegLoHi16<"vcc_lo", 106>;
+defm VCC_HI : SIRegLoHi16<"vcc_hi", 107>;
// Pseudo-registers: Used as placeholders during isel and immediately
// replaced, never seeing the verifier.
def PRIVATE_RSRC_REG : SIReg<"private_rsrc", 0>;
def FP_REG : SIReg<"fp", 0>;
def SP_REG : SIReg<"sp", 0>;
-def SCRATCH_WAVE_OFFSET_REG : SIReg<"scratch_wave_offset", 0>;
+
+// Pseudo-register to represent the program-counter DWARF register.
+def PC_REG : SIReg<"pc", 0>, DwarfRegNum<[16, 16]> {
+ // There is no physical register corresponding to a "program counter", but
+ // we need to encode the concept in debug information in order to represent
+ // things like the return value in unwind information.
+ let isArtificial = 1;
+}
// VCC for 64-bit instructions
-def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]>,
- DwarfRegAlias<VCC_LO> {
+def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
let HWEncoding = 106;
}
-def EXEC_LO : SIReg<"exec_lo", 126>;
-def EXEC_HI : SIReg<"exec_hi", 127>;
+defm EXEC_LO : SIRegLoHi16<"exec_lo", 126>, DwarfRegNum<[1, 1]>;
+defm EXEC_HI : SIRegLoHi16<"exec_hi", 127>;
-def EXEC : RegisterWithSubRegs<"exec", [EXEC_LO, EXEC_HI]>,
- DwarfRegAlias<EXEC_LO> {
+def EXEC : RegisterWithSubRegs<"exec", [EXEC_LO, EXEC_HI]>, DwarfRegNum<[17, 1]> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
let HWEncoding = 126;
@@ -114,71 +191,76 @@ def EXEC : RegisterWithSubRegs<"exec", [EXEC_LO, EXEC_HI]>,
// 32-bit real registers, for MC only.
// May be used with both 32-bit and 64-bit operands.
-def SRC_VCCZ : SIReg<"src_vccz", 251>;
-def SRC_EXECZ : SIReg<"src_execz", 252>;
-def SRC_SCC : SIReg<"src_scc", 253>;
+defm SRC_VCCZ : SIRegLoHi16<"src_vccz", 251>;
+defm SRC_EXECZ : SIRegLoHi16<"src_execz", 252>;
+defm SRC_SCC : SIRegLoHi16<"src_scc", 253>;
// 1-bit pseudo register, for codegen only.
// Should never be emitted.
def SCC : SIReg<"scc">;
-def M0 : SIReg <"m0", 124>;
-def SGPR_NULL : SIReg<"null", 125>;
+defm M0 : SIRegLoHi16 <"m0", 124>;
+defm SGPR_NULL : SIRegLoHi16 <"null", 125>;
-def SRC_SHARED_BASE : SIReg<"src_shared_base", 235>;
-def SRC_SHARED_LIMIT : SIReg<"src_shared_limit", 236>;
-def SRC_PRIVATE_BASE : SIReg<"src_private_base", 237>;
-def SRC_PRIVATE_LIMIT : SIReg<"src_private_limit", 238>;
-def SRC_POPS_EXITING_WAVE_ID : SIReg<"src_pops_exiting_wave_id", 239>;
+defm SRC_SHARED_BASE : SIRegLoHi16<"src_shared_base", 235>;
+defm SRC_SHARED_LIMIT : SIRegLoHi16<"src_shared_limit", 236>;
+defm SRC_PRIVATE_BASE : SIRegLoHi16<"src_private_base", 237>;
+defm SRC_PRIVATE_LIMIT : SIRegLoHi16<"src_private_limit", 238>;
+defm SRC_POPS_EXITING_WAVE_ID : SIRegLoHi16<"src_pops_exiting_wave_id", 239>;
-def LDS_DIRECT : SIReg <"src_lds_direct", 254>;
+// Not addressable
+def MODE : SIReg <"mode", 0>;
-def XNACK_MASK_LO : SIReg<"xnack_mask_lo", 104>;
-def XNACK_MASK_HI : SIReg<"xnack_mask_hi", 105>;
+def LDS_DIRECT : SIReg <"src_lds_direct", 254> {
+ // There is no physical register corresponding to this. This is an
+ // encoding value in a source field, which will ultimately trigger a
+ // read from m0.
+ let isArtificial = 1;
+}
-def XNACK_MASK : RegisterWithSubRegs<"xnack_mask", [XNACK_MASK_LO, XNACK_MASK_HI]>,
- DwarfRegAlias<XNACK_MASK_LO> {
+defm XNACK_MASK_LO : SIRegLoHi16<"xnack_mask_lo", 104>;
+defm XNACK_MASK_HI : SIRegLoHi16<"xnack_mask_hi", 105>;
+
+def XNACK_MASK :
+ RegisterWithSubRegs<"xnack_mask", [XNACK_MASK_LO, XNACK_MASK_HI]> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
let HWEncoding = 104;
}
// Trap handler registers
-def TBA_LO : SIReg<"tba_lo", 108>;
-def TBA_HI : SIReg<"tba_hi", 109>;
+defm TBA_LO : SIRegLoHi16<"tba_lo", 108>;
+defm TBA_HI : SIRegLoHi16<"tba_hi", 109>;
-def TBA : RegisterWithSubRegs<"tba", [TBA_LO, TBA_HI]>,
- DwarfRegAlias<TBA_LO> {
+def TBA : RegisterWithSubRegs<"tba", [TBA_LO, TBA_HI]> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
let HWEncoding = 108;
}
-def TMA_LO : SIReg<"tma_lo", 110>;
-def TMA_HI : SIReg<"tma_hi", 111>;
+defm TMA_LO : SIRegLoHi16<"tma_lo", 110>;
+defm TMA_HI : SIRegLoHi16<"tma_hi", 111>;
-def TMA : RegisterWithSubRegs<"tma", [TMA_LO, TMA_HI]>,
- DwarfRegAlias<TMA_LO> {
+def TMA : RegisterWithSubRegs<"tma", [TMA_LO, TMA_HI]> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
let HWEncoding = 110;
}
foreach Index = 0-15 in {
- def TTMP#Index#_vi : SIReg<"ttmp"#Index, !add(112, Index)>;
- def TTMP#Index#_gfx9_gfx10 : SIReg<"ttmp"#Index, !add(108, Index)>;
- def TTMP#Index : SIReg<"ttmp"#Index, 0>;
+ defm TTMP#Index#_vi : SIRegLoHi16<"ttmp"#Index, !add(112, Index)>;
+ defm TTMP#Index#_gfx9_gfx10 : SIRegLoHi16<"ttmp"#Index, !add(108, Index)>;
+ defm TTMP#Index : SIRegLoHi16<"ttmp"#Index, 0>;
}
multiclass FLAT_SCR_LOHI_m <string n, bits<16> ci_e, bits<16> vi_e> {
- def _ci : SIReg<n, ci_e>;
- def _vi : SIReg<n, vi_e>;
- def "" : SIReg<n, 0>;
+ defm _ci : SIRegLoHi16<n, ci_e>;
+ defm _vi : SIRegLoHi16<n, vi_e>;
+ defm "" : SIRegLoHi16<n, 0>;
}
class FlatReg <Register lo, Register hi, bits<16> encoding> :
- RegisterWithSubRegs<"flat_scratch", [lo, hi]>,
- DwarfRegAlias<lo> {
+ RegisterWithSubRegs<"flat_scratch", [lo, hi]> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
let HWEncoding = encoding;
@@ -193,21 +275,24 @@ def FLAT_SCR : FlatReg<FLAT_SCR_LO, FLAT_SCR_HI, 0>;
// SGPR registers
foreach Index = 0-105 in {
- def SGPR#Index : SIReg <"s"#Index, Index>;
+ defm SGPR#Index :
+ SIRegLoHi16 <"s"#Index, Index>,
+ DwarfRegNum<[!if(!le(Index, 63), !add(Index, 32), !add(Index, 1024)),
+ !if(!le(Index, 63), !add(Index, 32), !add(Index, 1024))]>;
}
// VGPR registers
foreach Index = 0-255 in {
- def VGPR#Index : SIReg <"v"#Index, Index> {
- let HWEncoding{8} = 1;
- }
+ defm VGPR#Index :
+ SIRegLoHi16 <"v"#Index, Index, 0, 1>,
+ DwarfRegNum<[!add(Index, 2560), !add(Index, 1536)]>;
}
// AccVGPR registers
foreach Index = 0-255 in {
- def AGPR#Index : SIReg <"a"#Index, Index> {
- let HWEncoding{8} = 1;
- }
+ defm AGPR#Index :
+ SIRegLoHi16 <"a"#Index, Index, 1, 1>,
+ DwarfRegNum<[!add(Index, 3072), !add(Index, 2048)]>;
}
//===----------------------------------------------------------------------===//
@@ -224,14 +309,35 @@ def M0_CLASS : RegisterClass<"AMDGPU", [i32], 32, (add M0)> {
let isAllocatable = 0;
}
+def M0_CLASS_LO16 : RegisterClass<"AMDGPU", [i16, f16], 16, (add M0_LO16)> {
+ let CopyCost = 1;
+ let Size = 16;
+ let isAllocatable = 0;
+}
+
// TODO: Do we need to set DwarfRegAlias on register tuples?
+def SGPR_LO16 : RegisterClass<"AMDGPU", [i16, f16], 16,
+ (add (sequence "SGPR%u_LO16", 0, 105))> {
+ let AllocationPriority = 9;
+ let Size = 16;
+ let GeneratePressureSet = 0;
+}
+
+def SGPR_HI16 : RegisterClass<"AMDGPU", [i16, f16], 16,
+ (add (sequence "SGPR%u_HI16", 0, 105))> {
+ let isAllocatable = 0;
+ let Size = 16;
+ let GeneratePressureSet = 0;
+}
+
// SGPR 32-bit registers
def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
(add (sequence "SGPR%u", 0, 105))> {
// Give all SGPR classes higher priority than VGPR classes, because
// we want to spill SGPRs to VGPRs.
let AllocationPriority = 9;
+ let GeneratePressureSet = 0;
}
// SGPR 64-bit registers
@@ -246,6 +352,9 @@ def SGPR_128Regs : SIRegisterTuples<getSubRegs<4>.ret, SGPR_32, 105, 4, 4, "s">;
// SGPR 160-bit registers. No operations use these, but for symmetry with 160-bit VGPRs.
def SGPR_160Regs : SIRegisterTuples<getSubRegs<5>.ret, SGPR_32, 105, 4, 5, "s">;
+// SGPR 192-bit registers
+def SGPR_192Regs : SIRegisterTuples<getSubRegs<6>.ret, SGPR_32, 105, 4, 6, "s">;
+
// SGPR 256-bit registers
def SGPR_256Regs : SIRegisterTuples<getSubRegs<8>.ret, SGPR_32, 105, 4, 8, "s">;
@@ -261,6 +370,12 @@ def TTMP_32 : RegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16], 32,
let isAllocatable = 0;
}
+def TTMP_LO16 : RegisterClass<"AMDGPU", [i16, f16], 16,
+ (add (sequence "TTMP%u_LO16", 0, 15))> {
+ let Size = 16;
+ let isAllocatable = 0;
+}
+
// Trap handler TMP 64-bit registers
def TTMP_64Regs : SIRegisterTuples<getSubRegs<2>.ret, TTMP_32, 15, 2, 2, "ttmp">;
@@ -357,6 +472,19 @@ class RegisterTypes<list<ValueType> reg_types> {
def Reg16Types : RegisterTypes<[i16, f16]>;
def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, p2, p3, p5, p6]>;
+def VGPR_LO16 : RegisterClass<"AMDGPU", Reg16Types.types, 16,
+ (add (sequence "VGPR%u_LO16", 0, 255))> {
+ let AllocationPriority = 1;
+ let Size = 16;
+ let GeneratePressureSet = 0;
+}
+
+def VGPR_HI16 : RegisterClass<"AMDGPU", Reg16Types.types, 16,
+ (add (sequence "VGPR%u_HI16", 0, 255))> {
+ let AllocationPriority = 1;
+ let Size = 16;
+ let GeneratePressureSet = 0;
+}
// VGPR 32-bit registers
// i16/f16 only on VI+
@@ -364,6 +492,7 @@ def VGPR_32 : RegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.t
(add (sequence "VGPR%u", 0, 255))> {
let AllocationPriority = 1;
let Size = 32;
+ let Weight = 1;
}
// VGPR 64-bit registers
@@ -378,6 +507,9 @@ def VGPR_128 : SIRegisterTuples<getSubRegs<4>.ret, VGPR_32, 255, 1, 4, "v">;
// VGPR 160-bit registers
def VGPR_160 : SIRegisterTuples<getSubRegs<5>.ret, VGPR_32, 255, 1, 5, "v">;
+// VGPR 192-bit registers
+def VGPR_192 : SIRegisterTuples<getSubRegs<6>.ret, VGPR_32, 255, 1, 6, "v">;
+
// VGPR 256-bit registers
def VGPR_256 : SIRegisterTuples<getSubRegs<8>.ret, VGPR_32, 255, 1, 8, "v">;
@@ -387,19 +519,39 @@ def VGPR_512 : SIRegisterTuples<getSubRegs<16>.ret, VGPR_32, 255, 1, 16, "v">;
// VGPR 1024-bit registers
def VGPR_1024 : SIRegisterTuples<getSubRegs<32>.ret, VGPR_32, 255, 1, 32, "v">;
+def AGPR_LO16 : RegisterClass<"AMDGPU", Reg16Types.types, 16,
+ (add (sequence "AGPR%u_LO16", 0, 255))> {
+ let isAllocatable = 0;
+ let Size = 16;
+ let GeneratePressureSet = 0;
+}
+
// AccVGPR 32-bit registers
def AGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
(add (sequence "AGPR%u", 0, 255))> {
let AllocationPriority = 1;
let Size = 32;
+ let Weight = 1;
}
// AGPR 64-bit registers
def AGPR_64 : SIRegisterTuples<getSubRegs<2>.ret, AGPR_32, 255, 1, 2, "a">;
+// AGPR 96-bit registers
+def AGPR_96 : SIRegisterTuples<getSubRegs<3>.ret, AGPR_32, 255, 1, 3, "a">;
+
// AGPR 128-bit registers
def AGPR_128 : SIRegisterTuples<getSubRegs<4>.ret, AGPR_32, 255, 1, 4, "a">;
+// AGPR 160-bit registers
+def AGPR_160 : SIRegisterTuples<getSubRegs<5>.ret, AGPR_32, 255, 1, 5, "a">;
+
+// AGPR 192-bit registers
+def AGPR_192 : SIRegisterTuples<getSubRegs<6>.ret, AGPR_32, 255, 1, 6, "a">;
+
+// AGPR 256-bit registers
+def AGPR_256 : SIRegisterTuples<getSubRegs<8>.ret, AGPR_32, 255, 1, 8, "a">;
+
// AGPR 512-bit registers
def AGPR_512 : SIRegisterTuples<getSubRegs<16>.ret, AGPR_32, 255, 1, 16, "a">;
@@ -411,7 +563,7 @@ def AGPR_1024 : SIRegisterTuples<getSubRegs<32>.ret, AGPR_32, 255, 1, 32, "a">;
//===----------------------------------------------------------------------===//
def Pseudo_SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
- (add FP_REG, SP_REG, SCRATCH_WAVE_OFFSET_REG)> {
+ (add FP_REG, SP_REG)> {
let isAllocatable = 0;
let CopyCost = -1;
}
@@ -422,12 +574,13 @@ def Pseudo_SReg_128 : RegisterClass<"AMDGPU", [v4i32, v2i64, v2f64], 32,
let CopyCost = -1;
}
-def LDS_DIRECT_CLASS : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+def LDS_DIRECT_CLASS : RegisterClass<"AMDGPU", [i32], 32,
(add LDS_DIRECT)> {
let isAllocatable = 0;
let CopyCost = -1;
}
+let GeneratePressureSet = 0 in {
// Subset of SReg_32 without M0 for SMRD instructions and alike.
// See comments in SIInstructions.td for more info.
def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
@@ -438,24 +591,54 @@ def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f1
let AllocationPriority = 10;
}
+def SReg_LO16_XM0_XEXEC : RegisterClass<"AMDGPU", [i16, f16], 16,
+ (add SGPR_LO16, VCC_LO_LO16, VCC_HI_LO16, FLAT_SCR_LO_LO16, FLAT_SCR_HI_LO16,
+ XNACK_MASK_LO_LO16, XNACK_MASK_HI_LO16, SGPR_NULL_LO16, TTMP_LO16, TMA_LO_LO16,
+ TMA_HI_LO16, TBA_LO_LO16, TBA_HI_LO16, SRC_SHARED_BASE_LO16,
+ SRC_SHARED_LIMIT_LO16, SRC_PRIVATE_BASE_LO16, SRC_PRIVATE_LIMIT_LO16,
+ SRC_POPS_EXITING_WAVE_ID_LO16, SRC_VCCZ_LO16, SRC_EXECZ_LO16, SRC_SCC_LO16)> {
+ let Size = 16;
+ let AllocationPriority = 10;
+}
+
def SReg_32_XEXEC_HI : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
(add SReg_32_XM0_XEXEC, EXEC_LO, M0_CLASS)> {
let AllocationPriority = 10;
}
+def SReg_LO16_XEXEC_HI : RegisterClass<"AMDGPU", [i16, f16], 16,
+ (add SReg_LO16_XM0_XEXEC, EXEC_LO_LO16, M0_CLASS_LO16)> {
+ let Size = 16;
+ let AllocationPriority = 10;
+}
+
def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
(add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI)> {
let AllocationPriority = 10;
}
+def SReg_LO16_XM0 : RegisterClass<"AMDGPU", [i16, f16], 16,
+ (add SReg_LO16_XM0_XEXEC, EXEC_LO_LO16, EXEC_HI_LO16)> {
+ let Size = 16;
+ let AllocationPriority = 10;
+}
+
+def SReg_LO16 : RegisterClass<"AMDGPU", [i16, f16], 16,
+ (add SGPR_LO16, SReg_LO16_XM0, M0_CLASS_LO16, EXEC_LO_LO16, EXEC_HI_LO16, SReg_LO16_XEXEC_HI)> {
+ let Size = 16;
+ let AllocationPriority = 10;
+}
+} // End GeneratePressureSet = 0
+
// Register class for all scalar registers (SGPRs + Special Registers)
def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
(add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI)> {
let AllocationPriority = 10;
}
-def SRegOrLds_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
- (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI, LDS_DIRECT_CLASS)> {
+let GeneratePressureSet = 0 in {
+def SRegOrLds_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+ (add SReg_32, LDS_DIRECT_CLASS)> {
let isAllocatable = 0;
}
@@ -528,7 +711,6 @@ def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64], 32,
def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32,
(add SGPR_128, TTMP_128)> {
- let AllocationPriority = 15;
let isAllocatable = 0;
}
@@ -543,39 +725,50 @@ def SGPR_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32,
def SReg_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32,
(add SGPR_160)> {
- let AllocationPriority = 16;
+ // FIXME: Should be isAllocatable = 0, but that causes all TableGen-generated
+ // subclasses of SGPR_160 to be marked unallocatable too.
}
-def SGPR_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256Regs)> {
+def SGPR_192 : RegisterClass<"AMDGPU", [untyped], 32, (add SGPR_192Regs)> {
+ let Size = 192;
let AllocationPriority = 17;
}
-def TTMP_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add TTMP_256Regs)> {
+def SReg_192 : RegisterClass<"AMDGPU", [untyped], 32, (add SGPR_192)> {
+ let Size = 192;
+ let isAllocatable = 0;
+}
+
+def SGPR_256 : RegisterClass<"AMDGPU", [v8i32, v8f32, v4i64, v4f64], 32, (add SGPR_256Regs)> {
+ let AllocationPriority = 18;
+}
+
+def TTMP_256 : RegisterClass<"AMDGPU", [v8i32, v8f32, v4i64, v4f64], 32, (add TTMP_256Regs)> {
let isAllocatable = 0;
}
-def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32,
+def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32, v4i64, v4f64], 32,
(add SGPR_256, TTMP_256)> {
// Requires 4 s_mov_b64 to copy
let CopyCost = 4;
- let AllocationPriority = 17;
+ let isAllocatable = 0;
}
-def SGPR_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
+def SGPR_512 : RegisterClass<"AMDGPU", [v16i32, v16f32, v8i64, v8f64], 32,
(add SGPR_512Regs)> {
- let AllocationPriority = 18;
+ let AllocationPriority = 19;
}
-def TTMP_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
+def TTMP_512 : RegisterClass<"AMDGPU", [v16i32, v16f32, v8i64, v8f64], 32,
(add TTMP_512Regs)> {
let isAllocatable = 0;
}
-def SReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
+def SReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32, v8i64, v8f64], 32,
(add SGPR_512, TTMP_512)> {
// Requires 8 s_mov_b64 to copy
let CopyCost = 8;
- let AllocationPriority = 18;
+ let isAllocatable = 0;
}
def VRegOrLds_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
@@ -583,105 +776,55 @@ def VRegOrLds_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 3
let isAllocatable = 0;
}
-def SGPR_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32,
+def SGPR_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32, v16i64, v16f64], 32,
(add SGPR_1024Regs)> {
- let AllocationPriority = 19;
+ let AllocationPriority = 20;
}
-def SReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32,
+def SReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32, v16i64, v16f64], 32,
(add SGPR_1024)> {
let CopyCost = 16;
- let AllocationPriority = 19;
-}
-
-// Register class for all vector registers (VGPRs + Interploation Registers)
-def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16, p0, p1, p4], 32,
- (add VGPR_64)> {
- let Size = 64;
-
- // Requires 2 v_mov_b32 to copy
- let CopyCost = 2;
- let AllocationPriority = 2;
-}
-
-def VReg_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32, (add VGPR_96)> {
- let Size = 96;
-
- // Requires 3 v_mov_b32 to copy
- let CopyCost = 3;
- let AllocationPriority = 3;
-}
-
-def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32,
- (add VGPR_128)> {
- let Size = 128;
-
- // Requires 4 v_mov_b32 to copy
- let CopyCost = 4;
- let AllocationPriority = 4;
-}
-
-def VReg_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32,
- (add VGPR_160)> {
- let Size = 160;
-
- // Requires 5 v_mov_b32 to copy
- let CopyCost = 5;
- let AllocationPriority = 5;
-}
-
-def VReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32,
- (add VGPR_256)> {
- let Size = 256;
- let CopyCost = 8;
- let AllocationPriority = 6;
-}
-
-def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
- (add VGPR_512)> {
- let Size = 512;
- let CopyCost = 16;
- let AllocationPriority = 7;
-}
-
-def VReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32,
- (add VGPR_1024)> {
- let Size = 1024;
- let CopyCost = 32;
- let AllocationPriority = 8;
+ let isAllocatable = 0;
}
-def AReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32,
- (add AGPR_64)> {
- let Size = 64;
+// Register class for all vector registers (VGPRs + Interpolation Registers)
+class VRegClass<int numRegs, list<ValueType> regTypes, dag regList> :
+ RegisterClass<"AMDGPU", regTypes, 32, regList> {
+ let Size = !mul(numRegs, 32);
- let CopyCost = 5;
- let AllocationPriority = 2;
+ // Requires n v_mov_b32 to copy
+ let CopyCost = numRegs;
+ let AllocationPriority = numRegs;
+ let Weight = numRegs;
}
-def AReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32,
- (add AGPR_128)> {
- let Size = 128;
+def VReg_64 : VRegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16, p0, p1, p4],
+ (add VGPR_64)>;
+def VReg_96 : VRegClass<3, [v3i32, v3f32], (add VGPR_96)>;
+def VReg_128 : VRegClass<4, [v4i32, v4f32, v2i64, v2f64, i128], (add VGPR_128)>;
+def VReg_160 : VRegClass<5, [v5i32, v5f32], (add VGPR_160)>;
+def VReg_192 : VRegClass<6, [untyped], (add VGPR_192)>;
+def VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64], (add VGPR_256)>;
+def VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64], (add VGPR_512)>;
+def VReg_1024 : VRegClass<32, [v32i32, v32f32, v16i64, v16f64], (add VGPR_1024)>;
- // Requires 4 v_accvgpr_write and 4 v_accvgpr_read to copy + burn 1 vgpr
- let CopyCost = 9;
- let AllocationPriority = 4;
+class ARegClass<int numRegs, list<ValueType> regTypes, dag regList> :
+ VRegClass<numRegs, regTypes, regList> {
+ // Requires n v_accvgpr_write and n v_accvgpr_read to copy + burn 1 vgpr
+ let CopyCost = !add(numRegs, numRegs, 1);
}
-def AReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
- (add AGPR_512)> {
- let Size = 512;
- let CopyCost = 33;
- let AllocationPriority = 7;
-}
-
-def AReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32,
- (add AGPR_1024)> {
- let Size = 1024;
- let CopyCost = 65;
- let AllocationPriority = 8;
-}
+def AReg_64 : ARegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16],
+ (add AGPR_64)>;
+def AReg_96 : ARegClass<3, [v3i32, v3f32], (add AGPR_96)>;
+def AReg_128 : ARegClass<4, [v4i32, v4f32, v2i64, v2f64], (add AGPR_128)>;
+def AReg_160 : ARegClass<5, [v5i32, v5f32], (add AGPR_160)>;
+def AReg_192 : ARegClass<6, [untyped], (add AGPR_192)>;
+def AReg_256 : ARegClass<8, [v8i32, v8f32, v4i64, v4f64], (add AGPR_256)>;
+def AReg_512 : ARegClass<16, [v16i32, v16f32, v8i64, v8f64], (add AGPR_512)>;
+def AReg_1024 : ARegClass<32, [v32i32, v32f32, v16i64, v16f64], (add AGPR_1024)>;
+} // End GeneratePressureSet = 0
// This is not a real register. This is just to have a register to add
// to VReg_1 that does not alias any real register that would
@@ -690,6 +833,7 @@ def ARTIFICIAL_VGPR : SIReg <"invalid vgpr", 0> {
let isArtificial = 1;
}
+let GeneratePressureSet = 0 in {
// FIXME: Should specify an empty set for this. No register should
// ever be allocated using VReg_1. This is a hack for SelectionDAG
// that should always be lowered by SILowerI1Copies. TableGen crashes
@@ -718,6 +862,7 @@ def AV_64 : RegisterClass<"AMDGPU", [i64, f64, v4f16], 32,
(add AReg_64, VReg_64)> {
let isAllocatable = 0;
}
+} // End GeneratePressureSet = 0
//===----------------------------------------------------------------------===//
// Register operands
diff --git a/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp b/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp
index 51779e97ac620..64fca0b467977 100644
--- a/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp
@@ -88,15 +88,17 @@ bool SIRemoveShortExecBranches::mustRetainExeczBranch(
for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
I != E; ++I) {
// When a uniform loop is inside non-uniform control flow, the branch
- // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
- // when EXEC = 0. We should skip the loop lest it becomes infinite.
- if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
- I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
+ // leaving the loop might never be taken when EXEC = 0.
+    // Hence we should retain the cbranch out of the loop lest the loop
+    // become infinite.
+ if (I->isConditionalBranch())
return true;
if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
return true;
+ if (TII->isKillTerminator(I->getOpcode()))
+ return true;
+
// These instructions are potentially expensive even if EXEC = 0.
if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) ||
I->getOpcode() == AMDGPU::S_WAITCNT)
diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td
index 824d1aeb0df9b..932381c99e0b0 100644
--- a/llvm/lib/Target/AMDGPU/SISchedule.td
+++ b/llvm/lib/Target/AMDGPU/SISchedule.td
@@ -1,4 +1,4 @@
-//===-- SISchedule.td - SI Scheduling definitons -------------------------===//
+//===-- SISchedule.td - SI Scheduling definitions -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -27,10 +27,14 @@ def WriteBarrier : SchedWrite;
def MIVGPRRead : SchedRead;
def MIMFMARead : SchedRead;
-// Vector ALU instructions
+// Normal 16 or 32 bit VALU instructions
def Write32Bit : SchedWrite;
+// Conversion to or from F32 (but not converting F64 to or from F32)
+def WriteFloatCvt : SchedWrite;
+// F16 or F32 transcendental instructions (these are quarter rate)
+def WriteTrans32 : SchedWrite;
+// Other quarter rate VALU instructions
def WriteQuarterRate32 : SchedWrite;
-def WriteFullOrQuarterRate32 : SchedWrite;
def WriteFloatFMA : SchedWrite;
@@ -43,6 +47,10 @@ def WriteDoubleAdd : SchedWrite;
// Conversion to or from f64 instruction
def WriteDoubleCvt : SchedWrite;
+// F64 "transcendental" (actually only reciprocal and/or square root)
+// instructions
+def WriteTrans64 : SchedWrite;
+
// Half rate 64-bit instructions.
def Write64Bit : SchedWrite;
@@ -56,7 +64,7 @@ def Write16PassMAI : SchedWrite;
// instructions)
class SISchedMachineModel : SchedMachineModel {
- let CompleteModel = 0;
+ let CompleteModel = 1;
// MicroOpBufferSize = 1 means that instructions will always be added
// the ready queue when they become available. This exposes them
// to the register pressure analysis.
@@ -127,6 +135,8 @@ multiclass SICommonWriteRes {
def : HWVALUWriteRes<Write32Bit, 1>;
def : HWVALUWriteRes<Write64Bit, 2>;
+ def : HWVALUWriteRes<WriteFloatCvt, 4>;
+ def : HWVALUWriteRes<WriteTrans32, 4>;
def : HWVALUWriteRes<WriteQuarterRate32, 4>;
def : HWVALUWriteRes<Write2PassMAI, 2>;
def : HWVALUWriteRes<Write8PassMAI, 8>;
@@ -135,9 +145,9 @@ multiclass SICommonWriteRes {
def : ReadAdvance<MIVGPRRead, -2>;
def : InstRW<[Write64Bit, MIReadVGPR], (instregex "^V_ACCVGPR_WRITE_B32$")>;
- // Technicaly mfma reads can be from 0 to 4 cycles but that does not make
+ // Technically mfma reads can be from 0 to 4 cycles but that does not make
// sense to model because its register setup is huge. In particular if we
- // properly model read advanice as -2 for a vgpr read it will result in a
+ // properly model read advance as -2 for a vgpr read it will result in a
// bad scheduling of acc writes before that mfma. To avoid it we would
// need to consume 2 or 4 more vgprs to be initialized before the acc
// write sequence. Just assume worst case here.
@@ -163,6 +173,7 @@ def : HWVALUWriteRes<WriteFloatFMA, 1>;
def : HWVALUWriteRes<WriteDouble, 4>;
def : HWVALUWriteRes<WriteDoubleAdd, 2>;
def : HWVALUWriteRes<WriteDoubleCvt, 4>;
+def : HWVALUWriteRes<WriteTrans64, 4>;
def : InstRW<[WriteCopy], (instrs COPY)>;
@@ -176,6 +187,7 @@ def : HWVALUWriteRes<WriteFloatFMA, 16>;
def : HWVALUWriteRes<WriteDouble, 16>;
def : HWVALUWriteRes<WriteDoubleAdd, 8>;
def : HWVALUWriteRes<WriteDoubleCvt, 4>;
+def : HWVALUWriteRes<WriteTrans64, 16>;
def : InstRW<[WriteCopy], (instrs COPY)>;
@@ -186,17 +198,20 @@ let SchedModel = GFX10SpeedModel in {
// The latency values are 1 / (operations / cycle).
// Add 1 stall cycle for VGPR read.
def : HWWriteRes<Write32Bit, [HWVALU, HWRC], 5>;
-def : HWWriteRes<Write64Bit, [HWVALU, HWRC], 9>;
-def : HWWriteRes<WriteQuarterRate32, [HWVALU, HWRC], 17>;
+def : HWWriteRes<WriteFloatCvt, [HWVALU, HWRC], 5>;
+def : HWWriteRes<Write64Bit, [HWVALU, HWRC], 6>;
+def : HWWriteRes<WriteTrans32, [HWVALU, HWRC], 10>;
+def : HWWriteRes<WriteQuarterRate32, [HWVALU, HWRC], 8>;
def : HWWriteRes<WriteFloatFMA, [HWVALU, HWRC], 5>;
-def : HWWriteRes<WriteDouble, [HWVALU, HWRC], 17>;
-def : HWWriteRes<WriteDoubleAdd, [HWVALU, HWRC], 17>;
-def : HWWriteRes<WriteDoubleCvt, [HWVALU, HWRC], 17>;
+def : HWWriteRes<WriteDouble, [HWVALU, HWRC], 22>;
+def : HWWriteRes<WriteDoubleAdd, [HWVALU, HWRC], 22>;
+def : HWWriteRes<WriteDoubleCvt, [HWVALU, HWRC], 22>;
+def : HWWriteRes<WriteTrans64, [HWVALU, HWRC], 24>;
def : HWWriteRes<WriteBranch, [HWBranch], 32>;
def : HWWriteRes<WriteExport, [HWExport, HWRC], 16>;
def : HWWriteRes<WriteLDS, [HWLGKM, HWRC], 20>;
-def : HWWriteRes<WriteSALU, [HWSALU, HWRC], 5>;
+def : HWWriteRes<WriteSALU, [HWSALU, HWRC], 2>;
def : HWWriteRes<WriteSMEM, [HWLGKM, HWRC], 20>;
def : HWWriteRes<WriteVMEM, [HWVMEM, HWRC], 320>;
def : HWWriteRes<WriteBarrier, [HWBranch], 2000>;
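The GFX10 numbers above follow the stated rule: latency = cycles per operation (1 / the issue rate) plus one stall cycle for the VGPR read. A tiny standalone arithmetic check; the per-operation cycle counts are back-derived from the table and are assumptions, not documented hardware rates:

    #include <cassert>

    // Latency = modeled cycles per operation + 1 stall cycle for the VGPR read.
    constexpr unsigned gfx10Latency(unsigned CyclesPerOp) { return CyclesPerOp + 1; }

    int main() {
      assert(gfx10Latency(4) == 5);   // Write32Bit, WriteFloatCvt, WriteFloatFMA
      assert(gfx10Latency(5) == 6);   // Write64Bit
      assert(gfx10Latency(7) == 8);   // WriteQuarterRate32
      assert(gfx10Latency(9) == 10);  // WriteTrans32
      return 0;
    }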
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 3986ca6dfa813..9c6833a7dab61 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -185,6 +185,11 @@ static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) {
if (!MI.getOperand(0).isReg())
TII->commuteInstruction(MI, false, 0, 1);
+ // cmpk requires src0 to be a register
+ const MachineOperand &Src0 = MI.getOperand(0);
+ if (!Src0.isReg())
+ return;
+
const MachineOperand &Src1 = MI.getOperand(1);
if (!Src1.isImm())
return;
@@ -220,7 +225,7 @@ static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) {
// Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding.
void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) {
const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
- if (Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA)
+ if (!Info || Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA)
return;
MachineFunction *MF = MI.getParent()->getParent();
@@ -323,60 +328,61 @@ static bool shrinkScalarLogicOp(const GCNSubtarget &ST,
MachineOperand *SrcReg = Src0;
MachineOperand *SrcImm = Src1;
- if (SrcImm->isImm() &&
- !AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST.hasInv2PiInlineImm())) {
- uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm());
- uint32_t NewImm = 0;
-
- if (Opc == AMDGPU::S_AND_B32) {
- if (isPowerOf2_32(~Imm)) {
- NewImm = countTrailingOnes(Imm);
- Opc = AMDGPU::S_BITSET0_B32;
- } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
- NewImm = ~Imm;
- Opc = AMDGPU::S_ANDN2_B32;
- }
- } else if (Opc == AMDGPU::S_OR_B32) {
- if (isPowerOf2_32(Imm)) {
- NewImm = countTrailingZeros(Imm);
- Opc = AMDGPU::S_BITSET1_B32;
- } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
- NewImm = ~Imm;
- Opc = AMDGPU::S_ORN2_B32;
- }
- } else if (Opc == AMDGPU::S_XOR_B32) {
- if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
- NewImm = ~Imm;
- Opc = AMDGPU::S_XNOR_B32;
- }
- } else {
- llvm_unreachable("unexpected opcode");
- }
+ if (!SrcImm->isImm() ||
+ AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST.hasInv2PiInlineImm()))
+ return false;
+
+ uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm());
+ uint32_t NewImm = 0;
- if ((Opc == AMDGPU::S_ANDN2_B32 || Opc == AMDGPU::S_ORN2_B32) &&
- SrcImm == Src0) {
- if (!TII->commuteInstruction(MI, false, 1, 2))
- NewImm = 0;
+ if (Opc == AMDGPU::S_AND_B32) {
+ if (isPowerOf2_32(~Imm)) {
+ NewImm = countTrailingOnes(Imm);
+ Opc = AMDGPU::S_BITSET0_B32;
+ } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
+ NewImm = ~Imm;
+ Opc = AMDGPU::S_ANDN2_B32;
+ }
+ } else if (Opc == AMDGPU::S_OR_B32) {
+ if (isPowerOf2_32(Imm)) {
+ NewImm = countTrailingZeros(Imm);
+ Opc = AMDGPU::S_BITSET1_B32;
+ } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
+ NewImm = ~Imm;
+ Opc = AMDGPU::S_ORN2_B32;
+ }
+ } else if (Opc == AMDGPU::S_XOR_B32) {
+ if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
+ NewImm = ~Imm;
+ Opc = AMDGPU::S_XNOR_B32;
}
+ } else {
+ llvm_unreachable("unexpected opcode");
+ }
- if (NewImm != 0) {
- if (Register::isVirtualRegister(Dest->getReg()) && SrcReg->isReg()) {
- MRI.setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
- MRI.setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
- return true;
- }
+ if ((Opc == AMDGPU::S_ANDN2_B32 || Opc == AMDGPU::S_ORN2_B32) &&
+ SrcImm == Src0) {
+ if (!TII->commuteInstruction(MI, false, 1, 2))
+ NewImm = 0;
+ }
- if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) {
- MI.setDesc(TII->get(Opc));
- if (Opc == AMDGPU::S_BITSET0_B32 ||
- Opc == AMDGPU::S_BITSET1_B32) {
- Src0->ChangeToImmediate(NewImm);
- // Remove the immediate and add the tied input.
- MI.getOperand(2).ChangeToRegister(Dest->getReg(), false);
- MI.tieOperands(0, 2);
- } else {
- SrcImm->setImm(NewImm);
- }
+ if (NewImm != 0) {
+ if (Register::isVirtualRegister(Dest->getReg()) && SrcReg->isReg()) {
+ MRI.setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
+ MRI.setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
+ return true;
+ }
+
+ if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) {
+ MI.setDesc(TII->get(Opc));
+ if (Opc == AMDGPU::S_BITSET0_B32 ||
+ Opc == AMDGPU::S_BITSET1_B32) {
+ Src0->ChangeToImmediate(NewImm);
+ // Remove the immediate and add the tied input.
+ MI.getOperand(2).ChangeToRegister(Dest->getReg(), false);
+ MI.tieOperands(0, 2);
+ } else {
+ SrcImm->setImm(NewImm);
}
}
}
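The restructured shrinkScalarLogicOp above rewrites an AND/OR/XOR with a non-inline 32-bit immediate to S_BITSET0/S_BITSET1 when exactly one bit is cleared/set, or to the negated form (S_ANDN2/S_ORN2/S_XNOR) when ~Imm is an inline constant. A standalone C++20 sketch of that decision follows; the enum, struct and helper names are illustrative stand-ins rather than the pass's own API, and the inlinability test is passed in as a flag instead of calling AMDGPU::isInlinableLiteral32.

    #include <bit>       // C++20: std::has_single_bit, std::countr_one, std::countr_zero
    #include <cstdint>
    #include <optional>

    enum class SOp { And, Or, Xor, AndN2, OrN2, Xnor, BitSet0, BitSet1 };
    struct Shrunk { SOp Opc; uint32_t NewImm; };

    // Mirrors the rewrite above:
    //  - AND with exactly one bit clear -> S_BITSET0, imm = index of that bit
    //  - OR with exactly one bit set    -> S_BITSET1, imm = index of that bit
    //  - otherwise the negated form (ANDN2/ORN2/XNOR) when ~Imm is inlinable.
    std::optional<Shrunk> pickShrunkOp(SOp Opc, uint32_t Imm, bool NotImmInlinable) {
      uint32_t NotImm = ~Imm;
      switch (Opc) {
      case SOp::And:
        if (std::has_single_bit(NotImm))
          return Shrunk{SOp::BitSet0, static_cast<uint32_t>(std::countr_one(Imm))};
        if (NotImmInlinable)
          return Shrunk{SOp::AndN2, NotImm};
        return std::nullopt;
      case SOp::Or:
        if (std::has_single_bit(Imm))
          return Shrunk{SOp::BitSet1, static_cast<uint32_t>(std::countr_zero(Imm))};
        if (NotImmInlinable)
          return Shrunk{SOp::OrN2, NotImm};
        return std::nullopt;
      case SOp::Xor:
        if (NotImmInlinable)
          return Shrunk{SOp::Xnor, NotImm};
        return std::nullopt;
      default:
        return std::nullopt;
      }
    }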
@@ -426,8 +432,7 @@ getSubRegForIndex(unsigned Reg, unsigned Sub, unsigned I,
if (Register::isPhysicalRegister(Reg)) {
Reg = TRI.getSubReg(Reg, TRI.getSubRegFromChannel(I));
} else {
- LaneBitmask LM = TRI.getSubRegIndexLaneMask(Sub);
- Sub = TRI.getSubRegFromChannel(I + countTrailingZeros(LM.getAsInteger()));
+ Sub = TRI.getSubRegFromChannel(I + TRI.getChannelFromSubReg(Sub));
}
}
return TargetInstrInfo::RegSubRegPair(Reg, Sub);
@@ -472,26 +477,30 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI,
if (!TRI.isVGPR(MRI, X))
return nullptr;
- for (MachineOperand &YTop : MRI.use_nodbg_operands(T)) {
- if (YTop.getSubReg() != Tsub)
- continue;
-
- MachineInstr &MovY = *YTop.getParent();
- if ((MovY.getOpcode() != AMDGPU::V_MOV_B32_e32 &&
- MovY.getOpcode() != AMDGPU::COPY) ||
- MovY.getOperand(1).getSubReg() != Tsub)
+ const unsigned SearchLimit = 16;
+ unsigned Count = 0;
+ for (auto Iter = std::next(MovT.getIterator()),
+ E = MovT.getParent()->instr_end();
+ Iter != E && Count < SearchLimit; ++Iter, ++Count) {
+
+ MachineInstr *MovY = &*Iter;
+ if ((MovY->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
+ MovY->getOpcode() != AMDGPU::COPY) ||
+ !MovY->getOperand(1).isReg() ||
+ MovY->getOperand(1).getReg() != T ||
+ MovY->getOperand(1).getSubReg() != Tsub)
continue;
- Register Y = MovY.getOperand(0).getReg();
- unsigned Ysub = MovY.getOperand(0).getSubReg();
+ Register Y = MovY->getOperand(0).getReg();
+ unsigned Ysub = MovY->getOperand(0).getSubReg();
- if (!TRI.isVGPR(MRI, Y) || MovT.getParent() != MovY.getParent())
+ if (!TRI.isVGPR(MRI, Y))
continue;
MachineInstr *MovX = nullptr;
- auto I = std::next(MovT.getIterator()), E = MovT.getParent()->instr_end();
- for (auto IY = MovY.getIterator(); I != E && I != IY; ++I) {
- if (instReadsReg(&*I, X, Xsub, TRI) ||
+ for (auto IY = MovY->getIterator(), I = std::next(MovT.getIterator());
+ I != IY; ++I) {
+ if (instReadsReg(&*I, X, Xsub, TRI) ||
instModifiesReg(&*I, Y, Ysub, TRI) ||
instModifiesReg(&*I, T, Tsub, TRI) ||
(MovX && instModifiesReg(&*I, X, Xsub, TRI))) {
@@ -516,7 +525,7 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI,
MovX = &*I;
}
- if (!MovX || I == E)
+ if (!MovX)
continue;
LLVM_DEBUG(dbgs() << "Matched v_swap_b32:\n" << MovT << *MovX << MovY);
@@ -533,7 +542,7 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI,
.addReg(X1.Reg, 0, X1.SubReg).getInstr();
}
MovX->eraseFromParent();
- MovY.eraseFromParent();
+ MovY->eraseFromParent();
MachineInstr *Next = &*std::next(MovT.getIterator());
if (MRI.use_nodbg_empty(T))
MovT.eraseFromParent();
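matchSwap now scans forward from MovT over at most SearchLimit (16) following instructions to find a candidate MovY, instead of walking all uses of T. A generic standalone sketch of that bounded forward-scan pattern follows; the container, element type and predicate are placeholders, not the MachineInstr iterator API.

    #include <iterator>
    #include <list>
    #include <string>

    // Return the first element within `Limit` steps after `Start` (exclusive)
    // that satisfies `P`, or `End` if none is found inside the window.
    template <typename Iter, typename Pred>
    Iter findWithinWindow(Iter Start, Iter End, unsigned Limit, Pred P) {
      unsigned Count = 0;
      for (Iter I = std::next(Start); I != End && Count < Limit; ++I, ++Count)
        if (P(*I))
          return I;
      return End;
    }

    int main() {
      std::list<std::string> Insts = {"mov t, x", "add", "mov y, t", "mov x, t"};
      auto Hit = findWithinWindow(Insts.begin(), Insts.end(), /*Limit=*/16,
                                  [](const std::string &S) { return S == "mov y, t"; });
      return Hit == Insts.end(); // 0 when the candidate is found in the window
    }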
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 39f5df767977e..b1c73df269fb2 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -61,6 +61,7 @@
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
@@ -154,7 +155,7 @@ private:
LiveIntervals *LIS;
DenseMap<const MachineInstr *, InstrInfo> Instructions;
- DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
+ MapVector<MachineBasicBlock *, BlockInfo> Blocks;
SmallVector<MachineInstr *, 1> LiveMaskQueries;
SmallVector<MachineInstr *, 4> LowerToMovInstrs;
SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
@@ -170,8 +171,6 @@ private:
void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
char analyzeFunction(MachineFunction &MF);
- bool requiresCorrectState(const MachineInstr &MI) const;
-
MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
MachineBasicBlock::iterator Before);
MachineBasicBlock::iterator
@@ -525,36 +524,6 @@ char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
return GlobalFlags;
}
-/// Whether \p MI really requires the exec state computed during analysis.
-///
-/// Scalar instructions must occasionally be marked WQM for correct propagation
-/// (e.g. thread masks leading up to branches), but when it comes to actual
-/// execution, they don't care about EXEC.
-bool SIWholeQuadMode::requiresCorrectState(const MachineInstr &MI) const {
- if (MI.isTerminator())
- return true;
-
- // Skip instructions that are not affected by EXEC
- if (TII->isScalarUnit(MI))
- return false;
-
- // Generic instructions such as COPY will either disappear by register
- // coalescing or be lowered to SALU or VALU instructions.
- if (MI.isTransient()) {
- if (MI.getNumExplicitOperands() >= 1) {
- const MachineOperand &Op = MI.getOperand(0);
- if (Op.isReg()) {
- if (TRI->isSGPRReg(*MRI, Op.getReg())) {
- // SGPR instructions are not affected by EXEC
- return false;
- }
- }
- }
- }
-
- return true;
-}
-
MachineBasicBlock::iterator
SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
MachineBasicBlock::iterator Before) {
@@ -741,7 +710,7 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
if (II != IE) {
MachineInstr &MI = *II;
- if (requiresCorrectState(MI)) {
+ if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
auto III = Instructions.find(&MI);
if (III != Instructions.end()) {
if (III->second.Needs & StateWWM)
@@ -793,18 +762,23 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
if (State == StateWWM) {
assert(SavedNonWWMReg);
fromWWM(MBB, Before, SavedNonWWMReg);
+ LIS->createAndComputeVirtRegInterval(SavedNonWWMReg);
+ SavedNonWWMReg = 0;
State = NonWWMState;
}
if (Needs == StateWWM) {
NonWWMState = State;
+ assert(!SavedNonWWMReg);
SavedNonWWMReg = MRI->createVirtualRegister(BoolRC);
toWWM(MBB, Before, SavedNonWWMReg);
State = StateWWM;
} else {
if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
- if (!WQMFromExec && (OutNeeds & StateWQM))
+ if (!WQMFromExec && (OutNeeds & StateWQM)) {
+ assert(!SavedWQMReg);
SavedWQMReg = MRI->createVirtualRegister(BoolRC);
+ }
toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
State = StateExact;
@@ -837,6 +811,8 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
break;
II = Next;
}
+ assert(!SavedWQMReg);
+ assert(!SavedNonWWMReg);
}
void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
@@ -929,10 +905,12 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
if (GlobalFlags == StateWQM) {
// For a shader that needs only WQM, we can just set it once.
- BuildMI(Entry, EntryMI, DebugLoc(), TII->get(ST->isWave32() ?
- AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64),
- Exec)
- .addReg(Exec);
+ auto MI = BuildMI(Entry, EntryMI, DebugLoc(),
+ TII->get(ST->isWave32() ? AMDGPU::S_WQM_B32
+ : AMDGPU::S_WQM_B64),
+ Exec)
+ .addReg(Exec);
+ LIS->InsertMachineInstrInMaps(*MI);
lowerCopyInstrs();
// EntryMI may become invalid here
@@ -948,6 +926,9 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
for (auto BII : Blocks)
processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin());
+ if (LiveMaskReg)
+ LIS->createAndComputeVirtRegInterval(LiveMaskReg);
+
// Physical registers like SCC aren't tracked by default anyway, so just
// removing the ranges we computed is the simplest option for maintaining
// the analysis results.
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index 79982d96c2c8e..70bf215c03f3f 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -1,4 +1,4 @@
-//===---- SMInstructions.td - Scalar Memory Instruction Defintions --------===//
+//===---- SMInstructions.td - Scalar Memory Instruction Definitions -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -11,9 +11,11 @@ def smrd_offset_8 : NamedOperandU32<"SMRDOffset8",
let OperandType = "OPERAND_IMMEDIATE";
}
-def smrd_offset_20 : NamedOperandU32<"SMRDOffset20",
- NamedMatchClass<"SMRDOffset20">> {
+def smem_offset : NamedOperandU32<"SMEMOffset",
+ NamedMatchClass<"SMEMOffset">> {
let OperandType = "OPERAND_IMMEDIATE";
+ let EncoderMethod = "getSMEMOffsetEncoding";
+ let DecoderMethod = "decodeSMEMOffset";
}
//===----------------------------------------------------------------------===//
@@ -43,6 +45,7 @@ class SM_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> patt
bit has_dlc = 0;
bits<1> has_offset = 1;
bits<1> offset_is_imm = 0;
+ bit is_buffer = 0;
}
class SM_Real <SM_Pseudo ps>
@@ -51,9 +54,15 @@ class SM_Real <SM_Pseudo ps>
let isPseudo = 0;
let isCodeGenOnly = 0;
+ Instruction Opcode = !cast<Instruction>(NAME);
+
// copy relevant pseudo op flags
let SubtargetPredicate = ps.SubtargetPredicate;
let AsmMatchConverter = ps.AsmMatchConverter;
+ let UseNamedOperandTable = ps.UseNamedOperandTable;
+ let SMRD = ps.SMRD;
+
+ bit is_buffer = ps.is_buffer;
// encoding
bits<7> sbase;
@@ -153,7 +162,7 @@ multiclass SM_Pseudo_Stores<string opName,
}
multiclass SM_Pseudo_Discards<string opName> {
- def _IMM : SM_Discard_Pseudo <opName, (ins SReg_64:$sbase, smrd_offset_20:$offset), 1>;
+ def _IMM : SM_Discard_Pseudo <opName, (ins SReg_64:$sbase, smem_offset:$offset), 1>;
def _SGPR : SM_Discard_Pseudo <opName, (ins SReg_64:$sbase, SReg_32:$offset), 0>;
}
@@ -178,14 +187,14 @@ class SM_Time_Pseudo<string opName, SDPatternOperator node = null_frag> : SM_Pse
class SM_Inval_Pseudo <string opName, SDPatternOperator node = null_frag> : SM_Pseudo<
opName, (outs), (ins), "", [(node)]> {
let hasSideEffects = 1;
- let mayStore = 1;
+ let mayStore = 0;
let has_sdst = 0;
let has_sbase = 0;
let has_offset = 0;
}
multiclass SM_Pseudo_Probe<string opName, RegisterClass baseClass> {
- def _IMM : SM_Probe_Pseudo <opName, (ins i8imm:$sdata, baseClass:$sbase, smrd_offset_20:$offset), 1>;
+ def _IMM : SM_Probe_Pseudo <opName, (ins i8imm:$sdata, baseClass:$sbase, smem_offset:$offset), 1>;
def _SGPR : SM_Probe_Pseudo <opName, (ins i8imm:$sdata, baseClass:$sbase, SReg_32:$offset), 0>;
}
@@ -228,7 +237,7 @@ class SM_Pseudo_Atomic<string opName,
SM_Atomic_Pseudo<opName,
!if(isRet, (outs dataClass:$sdst), (outs)),
!if(isImm,
- (ins dataClass:$sdata, baseClass:$sbase, smrd_offset_20:$offset, DLC:$dlc),
+ (ins dataClass:$sdata, baseClass:$sbase, smem_offset:$offset, DLC:$dlc),
(ins dataClass:$sdata, baseClass:$sbase, SReg_32:$offset, DLC:$dlc)),
!if(isRet, " $sdst", " $sdata") # ", $sbase, $offset" # !if(isRet, " glc", "") # "$dlc",
isRet> {
@@ -266,6 +275,7 @@ defm S_LOAD_DWORDX4 : SM_Pseudo_Loads <"s_load_dwordx4", SReg_64, SReg_128>;
defm S_LOAD_DWORDX8 : SM_Pseudo_Loads <"s_load_dwordx8", SReg_64, SReg_256>;
defm S_LOAD_DWORDX16 : SM_Pseudo_Loads <"s_load_dwordx16", SReg_64, SReg_512>;
+let is_buffer = 1 in {
defm S_BUFFER_LOAD_DWORD : SM_Pseudo_Loads <
"s_buffer_load_dword", SReg_128, SReg_32_XM0_XEXEC
>;
@@ -287,12 +297,14 @@ defm S_BUFFER_LOAD_DWORDX8 : SM_Pseudo_Loads <
defm S_BUFFER_LOAD_DWORDX16 : SM_Pseudo_Loads <
"s_buffer_load_dwordx16", SReg_128, SReg_512
>;
+}
let SubtargetPredicate = HasScalarStores in {
defm S_STORE_DWORD : SM_Pseudo_Stores <"s_store_dword", SReg_64, SReg_32_XM0_XEXEC>;
defm S_STORE_DWORDX2 : SM_Pseudo_Stores <"s_store_dwordx2", SReg_64, SReg_64_XEXEC>;
defm S_STORE_DWORDX4 : SM_Pseudo_Stores <"s_store_dwordx4", SReg_64, SReg_128>;
+let is_buffer = 1 in {
defm S_BUFFER_STORE_DWORD : SM_Pseudo_Stores <
"s_buffer_store_dword", SReg_128, SReg_32_XM0_XEXEC
>;
@@ -304,8 +316,10 @@ defm S_BUFFER_STORE_DWORDX2 : SM_Pseudo_Stores <
defm S_BUFFER_STORE_DWORDX4 : SM_Pseudo_Stores <
"s_buffer_store_dwordx4", SReg_128, SReg_128
>;
+}
} // End SubtargetPredicate = HasScalarStores
+let SubtargetPredicate = HasSMemTimeInst in
def S_MEMTIME : SM_Time_Pseudo <"s_memtime", int_amdgcn_s_memtime>;
def S_DCACHE_INV : SM_Inval_Pseudo <"s_dcache_inv", int_amdgcn_s_dcache_inv>;
@@ -321,13 +335,16 @@ def S_DCACHE_WB_VOL : SM_Inval_Pseudo <"s_dcache_wb_vol", int_amdgcn_s_dcache_wb
def S_MEMREALTIME : SM_Time_Pseudo <"s_memrealtime", int_amdgcn_s_memrealtime>;
defm S_ATC_PROBE : SM_Pseudo_Probe <"s_atc_probe", SReg_64>;
+let is_buffer = 1 in {
defm S_ATC_PROBE_BUFFER : SM_Pseudo_Probe <"s_atc_probe_buffer", SReg_128>;
+}
} // SubtargetPredicate = isGFX8Plus
-let SubtargetPredicate = isGFX10Plus in {
+let SubtargetPredicate = isGFX10Plus in
def S_GL1_INV : SM_Inval_Pseudo<"s_gl1_inv">;
+let SubtargetPredicate = HasGetWaveIdInst in
def S_GET_WAVEID_IN_WORKGROUP : SM_WaveId_Pseudo <"s_get_waveid_in_workgroup", int_amdgcn_s_get_waveid_in_workgroup>;
-} // End SubtargetPredicate = isGFX10Plus
+
let SubtargetPredicate = HasScalarFlatScratchInsts, Uses = [FLAT_SCR] in {
defm S_SCRATCH_LOAD_DWORD : SM_Pseudo_Loads <"s_scratch_load_dword", SReg_64, SReg_32_XM0_XEXEC>;
@@ -341,6 +358,7 @@ defm S_SCRATCH_STORE_DWORDX4 : SM_Pseudo_Stores <"s_scratch_store_dwordx4", SReg
let SubtargetPredicate = HasScalarAtomics in {
+let is_buffer = 1 in {
defm S_BUFFER_ATOMIC_SWAP : SM_Pseudo_Atomics <"s_buffer_atomic_swap", SReg_128, SReg_32_XM0_XEXEC>;
defm S_BUFFER_ATOMIC_CMPSWAP : SM_Pseudo_Atomics <"s_buffer_atomic_cmpswap", SReg_128, SReg_64_XEXEC>;
defm S_BUFFER_ATOMIC_ADD : SM_Pseudo_Atomics <"s_buffer_atomic_add", SReg_128, SReg_32_XM0_XEXEC>;
@@ -368,6 +386,7 @@ defm S_BUFFER_ATOMIC_OR_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_or_x2",
defm S_BUFFER_ATOMIC_XOR_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_xor_x2", SReg_128, SReg_64_XEXEC>;
defm S_BUFFER_ATOMIC_INC_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_inc_x2", SReg_128, SReg_64_XEXEC>;
defm S_BUFFER_ATOMIC_DEC_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_dec_x2", SReg_128, SReg_64_XEXEC>;
+}
defm S_ATOMIC_SWAP : SM_Pseudo_Atomics <"s_atomic_swap", SReg_64, SReg_32_XM0_XEXEC>;
defm S_ATOMIC_CMPSWAP : SM_Pseudo_Atomics <"s_atomic_cmpswap", SReg_64, SReg_64_XEXEC>;
@@ -481,14 +500,17 @@ class SMEM_Real_vi <bits<8> op, SM_Pseudo ps>
let Inst{17} = imm;
let Inst{25-18} = op;
let Inst{31-26} = 0x30; //encoding
- let Inst{51-32} = !if(ps.has_offset, offset{19-0}, ?);
+
+ // VI supports 20-bit unsigned offsets while GFX9+ supports 21-bit signed.
+ // Offset value is corrected accordingly when offset is encoded/decoded.
+ let Inst{52-32} = !if(ps.has_offset, offset{20-0}, ?);
}
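The widened Inst{52-32} field matches the comment above: VI encodes a 20-bit unsigned immediate offset, GFX9 and later a 21-bit signed one. A standalone sketch of the two range checks, assuming the offset is already in encoded units:

    #include <cstdint>

    // VI: 20-bit unsigned immediate offset field.
    bool fitsVIOffset(int64_t EncodedOffset) {
      return EncodedOffset >= 0 && EncodedOffset < (INT64_C(1) << 20);
    }

    // GFX9+: 21-bit signed immediate offset field.
    bool fitsGFX9Offset(int64_t EncodedOffset) {
      return EncodedOffset >= -(INT64_C(1) << 20) && EncodedOffset < (INT64_C(1) << 20);
    }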
multiclass SM_Real_Loads_vi<bits<8> op, string ps,
SM_Load_Pseudo immPs = !cast<SM_Load_Pseudo>(ps#_IMM),
SM_Load_Pseudo sgprPs = !cast<SM_Load_Pseudo>(ps#_SGPR)> {
def _IMM_vi : SMEM_Real_vi <op, immPs> {
- let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset_20:$offset, GLC:$glc, DLC:$dlc);
+ let InOperandList = (ins immPs.BaseClass:$sbase, smem_offset:$offset, GLC:$glc, DLC:$dlc);
}
def _SGPR_vi : SMEM_Real_vi <op, sgprPs> {
let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc, DLC:$dlc);
@@ -509,7 +531,7 @@ multiclass SM_Real_Stores_vi<bits<8> op, string ps,
// FIXME: The operand name $offset is inconsistent with $soff used
// in the pseudo
def _IMM_vi : SMEM_Real_Store_vi <op, immPs> {
- let InOperandList = (ins immPs.SrcClass:$sdata, immPs.BaseClass:$sbase, smrd_offset_20:$offset, GLC:$glc, DLC:$dlc);
+ let InOperandList = (ins immPs.SrcClass:$sdata, immPs.BaseClass:$sbase, smem_offset:$offset, GLC:$glc, DLC:$dlc);
}
def _SGPR_vi : SMEM_Real_Store_vi <op, sgprPs> {
@@ -665,12 +687,10 @@ class SMRD_Real_Load_IMM_ci <bits<5> op, SM_Load_Pseudo ps> :
let InOperandList = (ins ps.BaseClass:$sbase, smrd_literal_offset:$offset, GLC:$glc, DLC:$dlc);
let LGKM_CNT = ps.LGKM_CNT;
- let SMRD = ps.SMRD;
let mayLoad = ps.mayLoad;
let mayStore = ps.mayStore;
let hasSideEffects = ps.hasSideEffects;
let SchedRW = ps.SchedRW;
- let UseNamedOperandTable = ps.UseNamedOperandTable;
let Inst{7-0} = 0xff;
let Inst{8} = 0;
@@ -768,23 +788,26 @@ multiclass SMRD_Pattern <string Instr, ValueType vt> {
multiclass SMLoad_Pattern <string Instr, ValueType vt> {
// 1. Offset as an immediate
def : GCNPat <
- (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm i32:$offset), i1:$glc, i1:$dlc),
- (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, (as_i1imm $glc),
- (as_i1imm $dlc)))
- >;
+ (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm i32:$offset), timm:$cachepolicy),
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM") SReg_128:$sbase, i32imm:$offset, (extract_glc $cachepolicy),
+ (extract_dlc $cachepolicy)))> {
+ let AddedComplexity = 2;
+ }
// 2. 32-bit IMM offset on CI
def : GCNPat <
- (vt (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm32 i32:$offset), i1:$glc, i1:$dlc)),
- (!cast<InstSI>(Instr#"_IMM_ci") $sbase, $offset, (as_i1imm $glc), (as_i1imm $dlc))> {
+ (vt (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm32 i32:$offset), timm:$cachepolicy)),
+ (!cast<InstSI>(Instr#"_IMM_ci") SReg_128:$sbase, smrd_literal_offset:$offset,
+ (extract_glc $cachepolicy), (extract_dlc $cachepolicy))> {
let OtherPredicates = [isGFX7Only];
+ let AddedComplexity = 1;
}
  // 3. Offset loaded in a 32-bit SGPR
def : GCNPat <
- (SIsbuffer_load v4i32:$sbase, i32:$offset, i1:$glc, i1:$dlc),
- (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $offset, (as_i1imm $glc),
- (as_i1imm $dlc)))
+ (SIsbuffer_load v4i32:$sbase, i32:$offset, timm:$cachepolicy),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR") SReg_128:$sbase, SReg_32:$offset, (extract_glc $cachepolicy),
+ (extract_dlc $cachepolicy)))
>;
}
@@ -805,8 +828,13 @@ foreach vt = SReg_128.RegTypes in {
defm : SMRD_Pattern <"S_LOAD_DWORDX4", vt>;
}
-defm : SMRD_Pattern <"S_LOAD_DWORDX8", v8i32>;
-defm : SMRD_Pattern <"S_LOAD_DWORDX16", v16i32>;
+foreach vt = SReg_256.RegTypes in {
+defm : SMRD_Pattern <"S_LOAD_DWORDX8", vt>;
+}
+
+foreach vt = SReg_512.RegTypes in {
+defm : SMRD_Pattern <"S_LOAD_DWORDX16", vt>;
+}
defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORD", i32>;
defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX2", v2i32>;
@@ -821,10 +849,21 @@ defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX8", v8f32>;
defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX16", v16f32>;
} // End let AddedComplexity = 100
+let OtherPredicates = [HasSMemTimeInst] in {
def : GCNPat <
(i64 (readcyclecounter)),
(S_MEMTIME)
>;
+} // let OtherPredicates = [HasSMemTimeInst]
+
+let OtherPredicates = [HasNoSMemTimeInst] in {
+def : GCNPat <
+ (i64 (readcyclecounter)),
+ (REG_SEQUENCE SReg_64,
+ (S_GETREG_B32 getHwRegImm<HWREG.SHADER_CYCLES, 0, -12>.ret), sub0,
+ (S_MOV_B32 (i32 0)), sub1)
+>;
+} // let OtherPredicates = [HasNoSMemTimeInst]
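For subtargets without s_memtime, the pattern above assembles the 64-bit readcyclecounter value from the 32-bit SHADER_CYCLES hardware register in sub0 and a constant zero in sub1. A standalone sketch of that packing; the register read is stubbed out, since only the composition is being illustrated:

    #include <cstdint>

    // Stand-in for the s_getreg_b32 read of HW_REG_SHADER_CYCLES; a real value
    // would come from the hardware register, not a constant.
    static uint32_t readShaderCycles() { return 0; }

    // Mirrors the REG_SEQUENCE in the pattern: the low 32 bits come from the
    // cycle counter, the high 32 bits are a constant zero (S_MOV_B32 0).
    uint64_t readCycleCounter() {
      uint64_t Lo = readShaderCycles();
      uint64_t Hi = 0;
      return (Hi << 32) | Lo;
    }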
//===----------------------------------------------------------------------===//
// GFX10.
@@ -844,7 +883,7 @@ class SMEM_Real_gfx10<bits<8> op, SM_Pseudo ps> :
let Inst{16} = !if(ps.has_glc, glc, ?);
let Inst{25-18} = op;
let Inst{31-26} = 0x3d;
- let Inst{51-32} = !if(ps.offset_is_imm, !if(ps.has_offset, offset{19-0}, ?), ?);
+ let Inst{52-32} = !if(ps.offset_is_imm, !if(ps.has_offset, offset{20-0}, ?), ?);
let Inst{63-57} = !if(ps.offset_is_imm, !cast<int>(SGPR_NULL.HWEncoding),
!if(ps.has_offset, offset{6-0}, ?));
}
@@ -853,7 +892,7 @@ multiclass SM_Real_Loads_gfx10<bits<8> op, string ps,
SM_Load_Pseudo immPs = !cast<SM_Load_Pseudo>(ps#_IMM),
SM_Load_Pseudo sgprPs = !cast<SM_Load_Pseudo>(ps#_SGPR)> {
def _IMM_gfx10 : SMEM_Real_gfx10<op, immPs> {
- let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset_20:$offset, GLC:$glc, DLC:$dlc);
+ let InOperandList = (ins immPs.BaseClass:$sbase, smem_offset:$offset, GLC:$glc, DLC:$dlc);
}
def _SGPR_gfx10 : SMEM_Real_gfx10<op, sgprPs> {
let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc, DLC:$dlc);
@@ -873,7 +912,7 @@ multiclass SM_Real_Stores_gfx10<bits<8> op, string ps,
// FIXME: The operand name $offset is inconsistent with $soff used
// in the pseudo
def _IMM_gfx10 : SMEM_Real_Store_gfx10 <op, immPs> {
- let InOperandList = (ins immPs.SrcClass:$sdata, immPs.BaseClass:$sbase, smrd_offset_20:$offset, GLC:$glc, DLC:$dlc);
+ let InOperandList = (ins immPs.SrcClass:$sdata, immPs.BaseClass:$sbase, smem_offset:$offset, GLC:$glc, DLC:$dlc);
}
def _SGPR_gfx10 : SMEM_Real_Store_gfx10 <op, sgprPs> {
@@ -1020,3 +1059,12 @@ defm S_DCACHE_DISCARD : SM_Real_Discard_gfx10 <0x28, "S_DCACHE_DISCARD">;
defm S_DCACHE_DISCARD_X2 : SM_Real_Discard_gfx10 <0x29, "S_DCACHE_DISCARD_X2">;
} // End SubtargetPredicate = HasScalarAtomics
+
+def SMInfoTable : GenericTable {
+ let FilterClass = "SM_Real";
+ let CppTypeName = "SMInfo";
+ let Fields = ["Opcode", "is_buffer"];
+
+ let PrimaryKey = ["Opcode"];
+ let PrimaryKeyName = "getSMEMOpcodeHelper";
+}
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 73ba2ae367f7b..9d7b25d552170 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1,4 +1,4 @@
-//===-- SOPInstructions.td - SOP Instruction Defintions -------------------===//
+//===-- SOPInstructions.td - SOP Instruction Definitions ------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -97,6 +97,17 @@ class SOP1_0_32 <string opName, list<dag> pattern = []> : SOP1_Pseudo <
let has_sdst = 0;
}
+// Special case for movreld where sdst is treated as a use operand.
+class SOP1_32_movreld <string opName, list<dag> pattern=[]> : SOP1_Pseudo <
+ opName, (outs), (ins SReg_32:$sdst, SSrc_b32:$src0),
+ "$sdst, $src0", pattern>;
+
+// Special case for movreld where sdst is treated as a use operand.
+class SOP1_64_movreld <string opName, list<dag> pattern=[]> : SOP1_Pseudo <
+ opName, (outs), (ins SReg_64:$sdst, SSrc_b64:$src0),
+ "$sdst, $src0", pattern
+>;
+
class SOP1_0_32R <string opName, list<dag> pattern = []> : SOP1_Pseudo <
opName, (outs), (ins SReg_32:$src0),
"$src0", pattern> {
@@ -199,7 +210,9 @@ def S_BCNT1_I32_B64 : SOP1_32_64 <"s_bcnt1_i32_b64",
def S_FF0_I32_B32 : SOP1_32 <"s_ff0_i32_b32">;
def S_FF0_I32_B64 : SOP1_32_64 <"s_ff0_i32_b64">;
-def S_FF1_I32_B64 : SOP1_32_64 <"s_ff1_i32_b64">;
+def S_FF1_I32_B64 : SOP1_32_64 <"s_ff1_i32_b64",
+ [(set i32:$sdst, (AMDGPUffbl_b32 i64:$src0))]
+>;
def S_FF1_I32_B32 : SOP1_32 <"s_ff1_i32_b32",
[(set i32:$sdst, (AMDGPUffbl_b32 i32:$src0))]
@@ -209,7 +222,9 @@ def S_FLBIT_I32_B32 : SOP1_32 <"s_flbit_i32_b32",
[(set i32:$sdst, (AMDGPUffbh_u32 i32:$src0))]
>;
-def S_FLBIT_I32_B64 : SOP1_32_64 <"s_flbit_i32_b64">;
+def S_FLBIT_I32_B64 : SOP1_32_64 <"s_flbit_i32_b64",
+ [(set i32:$sdst, (AMDGPUffbh_u32 i64:$src0))]
+>;
def S_FLBIT_I32 : SOP1_32 <"s_flbit_i32",
[(set i32:$sdst, (AMDGPUffbh_i32 i32:$src0))]
>;
@@ -267,8 +282,8 @@ def S_QUADMASK_B64 : SOP1_64 <"s_quadmask_b64">;
let Uses = [M0] in {
def S_MOVRELS_B32 : SOP1_32R <"s_movrels_b32">;
def S_MOVRELS_B64 : SOP1_64R <"s_movrels_b64">;
-def S_MOVRELD_B32 : SOP1_32 <"s_movreld_b32">;
-def S_MOVRELD_B64 : SOP1_64 <"s_movreld_b64">;
+def S_MOVRELD_B32 : SOP1_32_movreld <"s_movreld_b32">;
+def S_MOVRELD_B64 : SOP1_64_movreld <"s_movreld_b64">;
} // End Uses = [M0]
let SubtargetPredicate = isGFX6GFX7GFX8GFX9 in {
@@ -283,8 +298,8 @@ def S_MOV_FED_B32 : SOP1_32 <"s_mov_fed_b32">;
let SubtargetPredicate = HasVGPRIndexMode in {
def S_SET_GPR_IDX_IDX : SOP1_0_32<"s_set_gpr_idx_idx"> {
- let Uses = [M0];
- let Defs = [M0];
+ let Uses = [M0, MODE];
+ let Defs = [M0, MODE];
}
}
@@ -401,8 +416,14 @@ class UniformUnaryFrag<SDPatternOperator Op> : PatFrag <
class UniformBinFrag<SDPatternOperator Op> : PatFrag <
(ops node:$src0, node:$src1),
(Op $src0, $src1),
- [{ return !N->isDivergent(); }]
->;
+ [{ return !N->isDivergent(); }]> {
+ // This check is unnecessary as it's captured by the result register
+ // bank constraint.
+ //
+ // FIXME: Should add a way for the emitter to recognize this is a
+ // trivially true predicate to eliminate the check.
+ let GISelPredicateCode = [{return true;}];
+}
let Defs = [SCC] in { // Carry out goes to SCC
let isCommutable = 1 in {
@@ -444,9 +465,19 @@ def S_MAX_U32 : SOP2_32 <"s_max_u32",
} // End isCommutable = 1
} // End Defs = [SCC]
+class SelectPat<SDPatternOperator select> : PatFrag <
+ (ops node:$src1, node:$src2),
+ (select SCC, $src1, $src2),
+ [{ return N->getOperand(0)->hasOneUse() && !N->isDivergent(); }]
+>;
let Uses = [SCC] in {
- def S_CSELECT_B32 : SOP2_32 <"s_cselect_b32">;
+ let AddedComplexity = 20 in {
+ def S_CSELECT_B32 : SOP2_32 <"s_cselect_b32",
+ [(set i32:$sdst, (SelectPat<select> i32:$src0, i32:$src1))]
+ >;
+ }
+
def S_CSELECT_B64 : SOP2_64 <"s_cselect_b64">;
} // End Uses = [SCC]
@@ -524,22 +555,22 @@ let AddedComplexity = 1 in {
let Defs = [SCC] in {
// TODO: b64 versions require VOP3 change since v_lshlrev_b64 is VOP3
def S_LSHL_B32 : SOP2_32 <"s_lshl_b32",
- [(set SReg_32:$sdst, (shl (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
+ [(set SReg_32:$sdst, (UniformBinFrag<shl> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
>;
def S_LSHL_B64 : SOP2_64_32 <"s_lshl_b64",
- [(set SReg_64:$sdst, (shl (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
+ [(set SReg_64:$sdst, (UniformBinFrag<shl> (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
>;
def S_LSHR_B32 : SOP2_32 <"s_lshr_b32",
- [(set SReg_32:$sdst, (srl (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
+ [(set SReg_32:$sdst, (UniformBinFrag<srl> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
>;
def S_LSHR_B64 : SOP2_64_32 <"s_lshr_b64",
- [(set SReg_64:$sdst, (srl (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
+ [(set SReg_64:$sdst, (UniformBinFrag<srl> (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
>;
def S_ASHR_I32 : SOP2_32 <"s_ashr_i32",
- [(set SReg_32:$sdst, (sra (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
+ [(set SReg_32:$sdst, (UniformBinFrag<sra> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
>;
def S_ASHR_I64 : SOP2_64_32 <"s_ashr_i64",
- [(set SReg_64:$sdst, (sra (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
+ [(set SReg_64:$sdst, (UniformBinFrag<sra> (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
>;
} // End Defs = [SCC]
@@ -592,14 +623,26 @@ let SubtargetPredicate = isGFX9Plus in {
def S_PACK_HH_B32_B16 : SOP2_32<"s_pack_hh_b32_b16">;
let Defs = [SCC] in {
- def S_LSHL1_ADD_U32 : SOP2_32<"s_lshl1_add_u32">;
- def S_LSHL2_ADD_U32 : SOP2_32<"s_lshl2_add_u32">;
- def S_LSHL3_ADD_U32 : SOP2_32<"s_lshl3_add_u32">;
- def S_LSHL4_ADD_U32 : SOP2_32<"s_lshl4_add_u32">;
+ def S_LSHL1_ADD_U32 : SOP2_32<"s_lshl1_add_u32",
+ [(set i32:$sdst, (shl1_add SSrc_b32:$src0, SSrc_b32:$src1))]
+ >;
+ def S_LSHL2_ADD_U32 : SOP2_32<"s_lshl2_add_u32",
+ [(set i32:$sdst, (shl2_add SSrc_b32:$src0, SSrc_b32:$src1))]
+ >;
+ def S_LSHL3_ADD_U32 : SOP2_32<"s_lshl3_add_u32",
+ [(set i32:$sdst, (shl3_add SSrc_b32:$src0, SSrc_b32:$src1))]
+ >;
+ def S_LSHL4_ADD_U32 : SOP2_32<"s_lshl4_add_u32",
+ [(set i32:$sdst, (shl4_add SSrc_b32:$src0, SSrc_b32:$src1))]
+ >;
} // End Defs = [SCC]
- def S_MUL_HI_U32 : SOP2_32<"s_mul_hi_u32">;
- def S_MUL_HI_I32 : SOP2_32<"s_mul_hi_i32">;
+ let isCommutable = 1 in {
+ def S_MUL_HI_U32 : SOP2_32<"s_mul_hi_u32",
+ [(set i32:$sdst, (UniformBinFrag<mulhu> SSrc_b32:$src0, SSrc_b32:$src1))]>;
+ def S_MUL_HI_I32 : SOP2_32<"s_mul_hi_i32",
+ [(set i32:$sdst, (UniformBinFrag<mulhs> SSrc_b32:$src0, SSrc_b32:$src1))]>;
+ }
} // End SubtargetPredicate = isGFX9Plus
//===----------------------------------------------------------------------===//
@@ -760,7 +803,11 @@ def S_CBRANCH_I_FORK : SOPK_Pseudo <
"$sdst, $simm16"
>;
+let hasSideEffects = 1 in {
+
let mayLoad = 1 in {
+// s_getreg_b32 should use hasSideEffects = 1 for tablegen to allow
+// its use in the readcyclecounter selection.
def S_GETREG_B32 : SOPK_Pseudo <
"s_getreg_b32",
(outs SReg_32:$sdst), (ins hwreg:$simm16),
@@ -768,14 +815,20 @@ def S_GETREG_B32 : SOPK_Pseudo <
>;
}
-let hasSideEffects = 1 in {
+let mayLoad = 0, mayStore = 0 in {
def S_SETREG_B32 : SOPK_Pseudo <
"s_setreg_b32",
(outs), (ins SReg_32:$sdst, hwreg:$simm16),
"$simm16, $sdst",
- [(AMDGPUsetreg i32:$sdst, (i16 timm:$simm16))]
->;
+ [(int_amdgcn_s_setreg (i32 timm:$simm16), i32:$sdst)]> {
+
+ // Use custom inserter to optimize some cases to
+ // S_DENORM_MODE/S_ROUND_MODE.
+ let usesCustomInserter = 1;
+ let Defs = [MODE];
+ let Uses = [MODE];
+}
// FIXME: Not on SI?
//def S_GETREG_REGRD_B32 : SOPK_32 <sopk<0x14, 0x13>, "s_getreg_regrd_b32">;
@@ -786,8 +839,11 @@ def S_SETREG_IMM32_B32 : SOPK_Pseudo <
"$simm16, $imm"> {
let Size = 8; // Unlike every other SOPK instruction.
let has_sdst = 0;
+ let Defs = [MODE];
+ let Uses = [MODE];
}
+}
} // End hasSideEffects = 1
class SOPK_WAITCNT<string opName, list<dag> pat=[]> :
@@ -920,12 +976,16 @@ def S_CMP_LG_U64 : SOPC_CMP_64 <0x13, "s_cmp_lg_u64", COND_NE>;
} // End SubtargetPredicate = isGFX8Plus
let SubtargetPredicate = HasVGPRIndexMode in {
+// Setting the GPR index mode is really writing the fields in the mode
+// register. We don't want to add mode register uses to every
+// instruction, and it's too complicated to deal with anyway. This is
+// modeled just as a side effect.
def S_SET_GPR_IDX_ON : SOPC <0x11,
(outs),
(ins SSrc_b32:$src0, GPRIdxMode:$src1),
"s_set_gpr_idx_on $src0,$src1"> {
- let Defs = [M0]; // No scc def
- let Uses = [M0]; // Other bits of m0 unmodified.
+ let Defs = [M0, MODE]; // No scc def
+ let Uses = [M0, MODE]; // Other bits of mode, m0 unmodified.
let hasSideEffects = 1; // Sets mode.gpr_idx_en
let FixedSize = 1;
}
@@ -1099,7 +1159,7 @@ def S_WAKEUP : SOPP <0x00000003, (ins), "s_wakeup"> {
let mayStore = 1;
}
-let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in
+let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16",
[(int_amdgcn_s_waitcnt timm:$simm16)]>;
def S_SETHALT : SOPP <0x0000000d, (ins i16imm:$simm16), "s_sethalt $simm16">;
@@ -1112,8 +1172,8 @@ def S_SETKILL : SOPP <0x0000000b, (ins i16imm:$simm16), "s_setkill $simm16">;
def S_SLEEP : SOPP <0x0000000e, (ins i32imm:$simm16),
"s_sleep $simm16", [(int_amdgcn_s_sleep timm:$simm16)]> {
let hasSideEffects = 1;
- let mayLoad = 1;
- let mayStore = 1;
+ let mayLoad = 0;
+ let mayStore = 0;
}
def S_SETPRIO : SOPP <0x0000000f, (ins i16imm:$simm16), "s_setprio $simm16">;
@@ -1138,14 +1198,14 @@ def S_ICACHE_INV : SOPP <0x00000013, (ins), "s_icache_inv"> {
def S_INCPERFLEVEL : SOPP <0x00000014, (ins i32imm:$simm16), "s_incperflevel $simm16",
[(int_amdgcn_s_incperflevel timm:$simm16)]> {
let hasSideEffects = 1;
- let mayLoad = 1;
- let mayStore = 1;
+ let mayLoad = 0;
+ let mayStore = 0;
}
def S_DECPERFLEVEL : SOPP <0x00000015, (ins i32imm:$simm16), "s_decperflevel $simm16",
[(int_amdgcn_s_decperflevel timm:$simm16)]> {
let hasSideEffects = 1;
- let mayLoad = 1;
- let mayStore = 1;
+ let mayLoad = 0;
+ let mayStore = 0;
}
def S_TTRACEDATA : SOPP <0x00000016, (ins), "s_ttracedata"> {
let simm16 = 0;
@@ -1154,6 +1214,8 @@ def S_TTRACEDATA : SOPP <0x00000016, (ins), "s_ttracedata"> {
let SubtargetPredicate = HasVGPRIndexMode in {
def S_SET_GPR_IDX_OFF : SOPP<0x1c, (ins), "s_set_gpr_idx_off"> {
let simm16 = 0;
+ let Defs = [MODE];
+ let Uses = [MODE];
}
}
} // End hasSideEffects
@@ -1161,7 +1223,8 @@ def S_SET_GPR_IDX_OFF : SOPP<0x1c, (ins), "s_set_gpr_idx_off"> {
let SubtargetPredicate = HasVGPRIndexMode in {
def S_SET_GPR_IDX_MODE : SOPP<0x1d, (ins GPRIdxMode:$simm16),
"s_set_gpr_idx_mode$simm16"> {
- let Defs = [M0];
+ let Defs = [M0, MODE];
+ let Uses = [MODE];
}
}
@@ -1176,13 +1239,15 @@ let SubtargetPredicate = isGFX10Plus in {
}
def S_WAITCNT_DEPCTR :
SOPP <0x023, (ins s16imm:$simm16), "s_waitcnt_depctr $simm16">;
- def S_ROUND_MODE :
- SOPP<0x024, (ins s16imm:$simm16), "s_round_mode $simm16">;
- def S_DENORM_MODE :
- SOPP<0x025, (ins i32imm:$simm16), "s_denorm_mode $simm16",
- [(SIdenorm_mode (i32 timm:$simm16))]> {
- let hasSideEffects = 1;
- }
+
+ let hasSideEffects = 0, Uses = [MODE], Defs = [MODE] in {
+ def S_ROUND_MODE :
+ SOPP<0x024, (ins s16imm:$simm16), "s_round_mode $simm16">;
+ def S_DENORM_MODE :
+ SOPP<0x025, (ins i32imm:$simm16), "s_denorm_mode $simm16",
+ [(SIdenorm_mode (i32 timm:$simm16))]>;
+ }
+
def S_TTRACEDATA_IMM :
SOPP<0x028, (ins s16imm:$simm16), "s_ttracedata_imm $simm16">;
} // End SubtargetPredicate = isGFX10Plus
@@ -1223,7 +1288,7 @@ def : GCNPat <
// Same as a 32-bit inreg
def : GCNPat<
- (i32 (sext i16:$src)),
+ (i32 (UniformUnaryFrag<sext> i16:$src)),
(S_SEXT_I32_I16 $src)
>;
@@ -1250,7 +1315,7 @@ def : GCNPat<
>;
def : GCNPat <
- (i64 (sext i16:$src)),
+ (i64 (UniformUnaryFrag<sext> i16:$src)),
(REG_SEQUENCE SReg_64, (i32 (S_SEXT_I32_I16 $src)), sub0,
(i32 (COPY_TO_REGCLASS (S_ASHR_I32 (i32 (S_SEXT_I32_I16 $src)), (S_MOV_B32 (i32 31))), SGPR_32)), sub1)
>;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
index 075e08986c0c0..5819a621f55d6 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
@@ -78,7 +78,11 @@ const char* const IdSymbolic[] = {
"HW_REG_XNACK_MASK",
nullptr, // HW_ID1, no predictable values
nullptr, // HW_ID2, no predictable values
- "HW_REG_POPS_PACKER"
+ "HW_REG_POPS_PACKER",
+ nullptr,
+ nullptr,
+ nullptr,
+ "HW_REG_SHADER_CYCLES"
};
} // namespace Hwreg
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 5271bc3aacc65..00e6d517bde58 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -108,6 +108,7 @@ namespace AMDGPU {
#define GET_MIMGInfoTable_IMPL
#define GET_MIMGLZMappingTable_IMPL
#define GET_MIMGMIPMappingTable_IMPL
+#define GET_MIMGG16MappingTable_IMPL
#include "AMDGPUGenSearchableTables.inc"
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding,
@@ -148,10 +149,17 @@ struct MTBUFInfo {
bool has_soffset;
};
+struct SMInfo {
+ uint16_t Opcode;
+ bool IsBuffer;
+};
+
#define GET_MTBUFInfoTable_DECL
#define GET_MTBUFInfoTable_IMPL
#define GET_MUBUFInfoTable_DECL
#define GET_MUBUFInfoTable_IMPL
+#define GET_SMInfoTable_DECL
+#define GET_SMInfoTable_IMPL
#include "AMDGPUGenSearchableTables.inc"
int getMTBUFBaseOpcode(unsigned Opc) {
@@ -214,6 +222,11 @@ bool getMUBUFHasSoffset(unsigned Opc) {
return Info ? Info->has_soffset : false;
}
+bool getSMEMIsBuffer(unsigned Opc) {
+ const SMInfo *Info = getSMEMOpcodeHelper(Opc);
+ return Info ? Info->IsBuffer : false;
+}
+
// Wrapper for Tablegen'd function. enum Subtarget is not defined in any
// header files, so we need to wrap it in a function that takes unsigned
// instead.
@@ -268,6 +281,13 @@ unsigned getLocalMemorySize(const MCSubtargetInfo *STI) {
}
unsigned getEUsPerCU(const MCSubtargetInfo *STI) {
+ // "Per CU" really means "per whatever functional block the waves of a
+ // workgroup must share". For gfx10 in CU mode this is the CU, which contains
+ // two SIMDs.
+ if (isGFX10(*STI) && STI->getFeatureBits().test(FeatureCuMode))
+ return 2;
+ // Pre-gfx10 a CU contains four SIMDs. For gfx10 in WGP mode the WGP contains
+ // two CUs, so a total of four SIMDs.
return 4;
}
@@ -283,15 +303,6 @@ unsigned getMaxWorkGroupsPerCU(const MCSubtargetInfo *STI,
return std::min(N, 16u);
}
-unsigned getMaxWavesPerCU(const MCSubtargetInfo *STI) {
- return getMaxWavesPerEU(STI) * getEUsPerCU(STI);
-}
-
-unsigned getMaxWavesPerCU(const MCSubtargetInfo *STI,
- unsigned FlatWorkGroupSize) {
- return getWavesPerWorkGroup(STI, FlatWorkGroupSize);
-}
-
unsigned getMinWavesPerEU(const MCSubtargetInfo *STI) {
return 1;
}
@@ -300,13 +311,13 @@ unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI) {
// FIXME: Need to take scratch memory into account.
if (!isGFX10(*STI))
return 10;
- return 20;
+ return hasGFX10_3Insts(*STI) ? 16 : 20;
}
-unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI,
- unsigned FlatWorkGroupSize) {
- return alignTo(getMaxWavesPerCU(STI, FlatWorkGroupSize),
- getEUsPerCU(STI)) / getEUsPerCU(STI);
+unsigned getWavesPerEUForWorkGroup(const MCSubtargetInfo *STI,
+ unsigned FlatWorkGroupSize) {
+ return divideCeil(getWavesPerWorkGroup(STI, FlatWorkGroupSize),
+ getEUsPerCU(STI));
}
unsigned getMinFlatWorkGroupSize(const MCSubtargetInfo *STI) {
@@ -320,8 +331,7 @@ unsigned getMaxFlatWorkGroupSize(const MCSubtargetInfo *STI) {
unsigned getWavesPerWorkGroup(const MCSubtargetInfo *STI,
unsigned FlatWorkGroupSize) {
- return alignTo(FlatWorkGroupSize, getWavefrontSize(STI)) /
- getWavefrontSize(STI);
+ return divideCeil(FlatWorkGroupSize, getWavefrontSize(STI));
}
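With divideCeil, partially filled waves and EUs round up instead of being aligned. A small worked example under assumed parameters (wavefront size 32; two EUs per CU, as the gfx10 CU-mode comment above describes):

    #include <cassert>

    // Same rounding-up division as llvm::divideCeil, for unsigned operands.
    constexpr unsigned divideCeil(unsigned Numerator, unsigned Denominator) {
      return (Numerator + Denominator - 1) / Denominator;
    }

    int main() {
      const unsigned WavefrontSize = 32; // assumed for the example
      const unsigned EUsPerCU = 2;       // gfx10 in CU mode, per the comment above

      unsigned FlatWorkGroupSize = 96;
      unsigned WavesPerWorkGroup = divideCeil(FlatWorkGroupSize, WavefrontSize); // 3
      unsigned WavesPerEU = divideCeil(WavesPerWorkGroup, EUsPerCU);             // 2

      assert(WavesPerWorkGroup == 3 && WavesPerEU == 2);
      return 0;
    }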
unsigned getSGPRAllocGranule(const MCSubtargetInfo *STI) {
@@ -431,12 +441,21 @@ unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI,
bool IsWave32 = EnableWavefrontSize32 ?
*EnableWavefrontSize32 :
STI->getFeatureBits().test(FeatureWavefrontSize32);
+
+ if (hasGFX10_3Insts(*STI))
+ return IsWave32 ? 16 : 8;
+
return IsWave32 ? 8 : 4;
}
unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI,
Optional<bool> EnableWavefrontSize32) {
- return getVGPRAllocGranule(STI, EnableWavefrontSize32);
+
+ bool IsWave32 = EnableWavefrontSize32 ?
+ *EnableWavefrontSize32 :
+ STI->getFeatureBits().test(FeatureWavefrontSize32);
+
+ return IsWave32 ? 8 : 4;
}
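The gfx10.3 case doubles the VGPR allocation granule while the encoding granule keeps the previous wave32/wave64 values. A standalone sketch of the two helpers' logic, with plain booleans standing in for the subtarget feature queries:

    // Allocation granule: doubled on gfx10.3, otherwise 8/4 for wave32/wave64.
    unsigned vgprAllocGranule(bool HasGFX10_3Insts, bool IsWave32) {
      if (HasGFX10_3Insts)
        return IsWave32 ? 16 : 8;
      return IsWave32 ? 8 : 4;
    }

    // Encoding granule: unchanged by gfx10.3, always 8/4 for wave32/wave64.
    unsigned vgprEncodingGranule(bool IsWave32) {
      return IsWave32 ? 8 : 4;
    }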
unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) {
@@ -559,7 +578,7 @@ bool isReadOnlySegment(const GlobalValue *GV) {
}
bool shouldEmitConstantsToTextSection(const Triple &TT) {
- return TT.getOS() == Triple::AMDPAL;
+ return TT.getOS() == Triple::AMDPAL || TT.getArch() == Triple::r600;
}
int getIntegerAttribute(const Function &F, StringRef Name, int Default) {
@@ -722,13 +741,16 @@ static unsigned getLastSymbolicHwreg(const MCSubtargetInfo &STI) {
return ID_SYMBOLIC_FIRST_GFX9_;
else if (isGFX9(STI))
return ID_SYMBOLIC_FIRST_GFX10_;
+ else if (isGFX10(STI) && !isGFX10_BEncoding(STI))
+ return ID_SYMBOLIC_FIRST_GFX1030_;
else
return ID_SYMBOLIC_LAST_;
}
bool isValidHwreg(int64_t Id, const MCSubtargetInfo &STI) {
- return ID_SYMBOLIC_FIRST_ <= Id && Id < getLastSymbolicHwreg(STI) &&
- IdSymbolic[Id];
+ return
+ ID_SYMBOLIC_FIRST_ <= Id && Id < getLastSymbolicHwreg(STI) &&
+ IdSymbolic[Id] && (Id != ID_XNACK_MASK || !AMDGPU::isGFX10_BEncoding(STI));
}
bool isValidHwreg(int64_t Id) {
@@ -927,7 +949,15 @@ bool hasSRAMECC(const MCSubtargetInfo &STI) {
}
bool hasMIMG_R128(const MCSubtargetInfo &STI) {
- return STI.getFeatureBits()[AMDGPU::FeatureMIMG_R128];
+ return STI.getFeatureBits()[AMDGPU::FeatureMIMG_R128] && !STI.getFeatureBits()[AMDGPU::FeatureR128A16];
+}
+
+bool hasGFX10A16(const MCSubtargetInfo &STI) {
+ return STI.getFeatureBits()[AMDGPU::FeatureGFX10A16];
+}
+
+bool hasG16(const MCSubtargetInfo &STI) {
+ return STI.getFeatureBits()[AMDGPU::FeatureG16];
}
bool hasPackedD16(const MCSubtargetInfo &STI) {
@@ -958,9 +988,17 @@ bool isGCN3Encoding(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding];
}
+bool isGFX10_BEncoding(const MCSubtargetInfo &STI) {
+ return STI.getFeatureBits()[AMDGPU::FeatureGFX10_BEncoding];
+}
+
+bool hasGFX10_3Insts(const MCSubtargetInfo &STI) {
+ return STI.getFeatureBits()[AMDGPU::FeatureGFX10_3Insts];
+}
+
bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) {
const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID);
- const unsigned FirstSubReg = TRI->getSubReg(Reg, 1);
+ const unsigned FirstSubReg = TRI->getSubReg(Reg, AMDGPU::sub0);
return SGPRClass.contains(FirstSubReg != 0 ? FirstSubReg : Reg) ||
Reg == AMDGPU::SCC;
}
@@ -1082,6 +1120,11 @@ bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo) {
// (move from MC* level to Target* level). Return size in bits.
unsigned getRegBitWidth(unsigned RCID) {
switch (RCID) {
+ case AMDGPU::VGPR_LO16RegClassID:
+ case AMDGPU::VGPR_HI16RegClassID:
+ case AMDGPU::SGPR_LO16RegClassID:
+ case AMDGPU::AGPR_LO16RegClassID:
+ return 16;
case AMDGPU::SGPR_32RegClassID:
case AMDGPU::VGPR_32RegClassID:
case AMDGPU::VRegOrLds_32RegClassID:
@@ -1103,6 +1146,7 @@ unsigned getRegBitWidth(unsigned RCID) {
case AMDGPU::SGPR_96RegClassID:
case AMDGPU::SReg_96RegClassID:
case AMDGPU::VReg_96RegClassID:
+ case AMDGPU::AReg_96RegClassID:
return 96;
case AMDGPU::SGPR_128RegClassID:
case AMDGPU::SReg_128RegClassID:
@@ -1112,14 +1156,24 @@ unsigned getRegBitWidth(unsigned RCID) {
case AMDGPU::SGPR_160RegClassID:
case AMDGPU::SReg_160RegClassID:
case AMDGPU::VReg_160RegClassID:
+ case AMDGPU::AReg_160RegClassID:
return 160;
+ case AMDGPU::SGPR_192RegClassID:
+ case AMDGPU::SReg_192RegClassID:
+ case AMDGPU::VReg_192RegClassID:
+ case AMDGPU::AReg_192RegClassID:
+ return 192;
+ case AMDGPU::SGPR_256RegClassID:
case AMDGPU::SReg_256RegClassID:
case AMDGPU::VReg_256RegClassID:
+ case AMDGPU::AReg_256RegClassID:
return 256;
+ case AMDGPU::SGPR_512RegClassID:
case AMDGPU::SReg_512RegClassID:
case AMDGPU::VReg_512RegClassID:
case AMDGPU::AReg_512RegClassID:
return 512;
+ case AMDGPU::SGPR_1024RegClassID:
case AMDGPU::SReg_1024RegClassID:
case AMDGPU::VReg_1024RegClassID:
case AMDGPU::AReg_1024RegClassID:
@@ -1141,7 +1195,7 @@ unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc,
}
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi) {
- if (Literal >= -16 && Literal <= 64)
+ if (isInlinableIntLiteral(Literal))
return true;
uint64_t Val = static_cast<uint64_t>(Literal);
@@ -1158,7 +1212,7 @@ bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi) {
}
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi) {
- if (Literal >= -16 && Literal <= 64)
+ if (isInlinableIntLiteral(Literal))
return true;
// The actual type of the operand does not seem to matter as long
@@ -1187,7 +1241,7 @@ bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) {
if (!HasInv2Pi)
return false;
- if (Literal >= -16 && Literal <= 64)
+ if (isInlinableIntLiteral(Literal))
return true;
uint16_t Val = static_cast<uint16_t>(Literal);
@@ -1217,6 +1271,17 @@ bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi) {
return Lo16 == Hi16 && isInlinableLiteral16(Lo16, HasInv2Pi);
}
+bool isInlinableIntLiteralV216(int32_t Literal) {
+ int16_t Lo16 = static_cast<int16_t>(Literal);
+ if (isInt<16>(Literal) || isUInt<16>(Literal))
+ return isInlinableIntLiteral(Lo16);
+
+ int16_t Hi16 = static_cast<int16_t>(Literal >> 16);
+ if (!(Literal & 0xffff))
+ return isInlinableIntLiteral(Hi16);
+ return Lo16 == Hi16 && isInlinableIntLiteral(Lo16);
+}
+
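isInlinableIntLiteralV216 above treats a packed v2i16 literal as inlinable when the meaningful 16-bit half (or both identical halves) is an integer inline constant. A standalone sketch with the same splitting logic plus a few example values; isInlinableIntLiteral is re-declared locally from its definition in the header change further below:

    #include <cassert>
    #include <cstdint>

    // Integer inline constants are -16..64 (see the header change further below).
    static bool isInlinableIntLiteral(int64_t Literal) {
      return Literal >= -16 && Literal <= 64;
    }

    static bool isInlinableIntLiteralV216(int32_t Literal) {
      int16_t Lo16 = static_cast<int16_t>(Literal);
      // The literal fits in a single sign- or zero-extended 16-bit half.
      if (Literal >= INT16_MIN && Literal <= UINT16_MAX)
        return isInlinableIntLiteral(Lo16);

      int16_t Hi16 = static_cast<int16_t>(Literal >> 16);
      if ((Literal & 0xffff) == 0)
        return isInlinableIntLiteral(Hi16);
      return Lo16 == Hi16 && isInlinableIntLiteral(Lo16);
    }

    int main() {
      assert(isInlinableIntLiteralV216(0x00400040));                       // both halves 64
      assert(isInlinableIntLiteralV216(static_cast<int32_t>(0xfff0fff0))); // both halves -16
      assert(!isInlinableIntLiteralV216(65));                              // 65 is not inline
      return 0;
    }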
bool isArgPassedInSGPR(const Argument *A) {
const Function *F = A->getParent();
@@ -1247,16 +1312,61 @@ static bool hasSMEMByteOffset(const MCSubtargetInfo &ST) {
return isGCN3Encoding(ST) || isGFX10(ST);
}
-int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) {
+static bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST) {
+ return isGFX9(ST) || isGFX10(ST);
+}
+
+bool isLegalSMRDEncodedUnsignedOffset(const MCSubtargetInfo &ST,
+ int64_t EncodedOffset) {
+ return hasSMEMByteOffset(ST) ? isUInt<20>(EncodedOffset)
+ : isUInt<8>(EncodedOffset);
+}
+
+bool isLegalSMRDEncodedSignedOffset(const MCSubtargetInfo &ST,
+ int64_t EncodedOffset,
+ bool IsBuffer) {
+ return !IsBuffer &&
+ hasSMRDSignedImmOffset(ST) &&
+ isInt<21>(EncodedOffset);
+}
+
+static bool isDwordAligned(uint64_t ByteOffset) {
+ return (ByteOffset & 3) == 0;
+}
+
+uint64_t convertSMRDOffsetUnits(const MCSubtargetInfo &ST,
+ uint64_t ByteOffset) {
if (hasSMEMByteOffset(ST))
return ByteOffset;
+
+ assert(isDwordAligned(ByteOffset));
return ByteOffset >> 2;
}
-bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) {
- int64_t EncodedOffset = getSMRDEncodedOffset(ST, ByteOffset);
- return (hasSMEMByteOffset(ST)) ?
- isUInt<20>(EncodedOffset) : isUInt<8>(EncodedOffset);
+Optional<int64_t> getSMRDEncodedOffset(const MCSubtargetInfo &ST,
+ int64_t ByteOffset, bool IsBuffer) {
+ // The signed version is always a byte offset.
+ if (!IsBuffer && hasSMRDSignedImmOffset(ST)) {
+ assert(hasSMEMByteOffset(ST));
+ return isInt<20>(ByteOffset) ? Optional<int64_t>(ByteOffset) : None;
+ }
+
+ if (!isDwordAligned(ByteOffset) && !hasSMEMByteOffset(ST))
+ return None;
+
+ int64_t EncodedOffset = convertSMRDOffsetUnits(ST, ByteOffset);
+ return isLegalSMRDEncodedUnsignedOffset(ST, EncodedOffset)
+ ? Optional<int64_t>(EncodedOffset)
+ : None;
+}
+
+Optional<int64_t> getSMRDEncodedLiteralOffset32(const MCSubtargetInfo &ST,
+ int64_t ByteOffset) {
+ if (!isCI(ST) || !isDwordAligned(ByteOffset))
+ return None;
+
+ int64_t EncodedOffset = convertSMRDOffsetUnits(ST, ByteOffset);
+ return isUInt<32>(EncodedOffset) ? Optional<int64_t>(EncodedOffset) : None;
}
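getSMRDEncodedOffset now returns Optional and folds the legality check in: non-buffer loads on subtargets with signed immediates keep the byte offset, everything else is converted to the subtarget's offset units and checked against the unsigned field width. A standalone sketch of that decision; the boolean parameters stand in for the subtarget queries and std::optional for llvm::Optional:

    #include <cstdint>
    #include <optional>

    // HasSignedImm corresponds to GFX9/GFX10, HasByteOffset to GCN3+/GFX10;
    // both names are stand-ins, not the AMDGPUBaseInfo helpers themselves.
    std::optional<int64_t> encodeSMRDOffset(int64_t ByteOffset, bool IsBuffer,
                                            bool HasSignedImm, bool HasByteOffset) {
      // Non-buffer loads with signed immediates always take a byte offset.
      if (!IsBuffer && HasSignedImm)
        return (ByteOffset >= -(INT64_C(1) << 19) && ByteOffset < (INT64_C(1) << 19))
                   ? std::optional<int64_t>(ByteOffset)
                   : std::nullopt;

      // Subtargets with dword units need a dword-aligned byte offset.
      if (!HasByteOffset && (ByteOffset & 3) != 0)
        return std::nullopt;

      int64_t Encoded = HasByteOffset ? ByteOffset : ByteOffset >> 2;
      bool Fits = HasByteOffset
                      ? (Encoded >= 0 && Encoded < (INT64_C(1) << 20))  // isUInt<20>
                      : (Encoded >= 0 && Encoded < (INT64_C(1) << 8));  // isUInt<8>
      return Fits ? std::optional<int64_t>(Encoded) : std::nullopt;
    }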
// Given Imm, split it into the values to put into the SOffset and ImmOffset
@@ -1267,8 +1377,8 @@ bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) {
// aligned if they are aligned to begin with. It also ensures that additional
// offsets within the given alignment can be added to the resulting ImmOffset.
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
- const GCNSubtarget *Subtarget, uint32_t Align) {
- const uint32_t MaxImm = alignDown(4095, Align);
+ const GCNSubtarget *Subtarget, Align Alignment) {
+ const uint32_t MaxImm = alignDown(4095, Alignment.value());
uint32_t Overflow = 0;
if (Imm > MaxImm) {
@@ -1286,10 +1396,10 @@ bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
//
// Atomic operations fail to work correctly when individual address
// components are unaligned, even if their sum is aligned.
- uint32_t High = (Imm + Align) & ~4095;
- uint32_t Low = (Imm + Align) & 4095;
+ uint32_t High = (Imm + Alignment.value()) & ~4095;
+ uint32_t Low = (Imm + Alignment.value()) & 4095;
Imm = Low;
- Overflow = High - Align;
+ Overflow = High - Alignment.value();
}
}
@@ -1305,8 +1415,7 @@ bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
return true;
}
-SIModeRegisterDefaults::SIModeRegisterDefaults(const Function &F,
- const GCNSubtarget &ST) {
+SIModeRegisterDefaults::SIModeRegisterDefaults(const Function &F) {
*this = getDefaultForCallingConv(F.getCallingConv());
StringRef IEEEAttr = F.getFnAttribute("amdgpu-ieee").getValueAsString();
@@ -1318,8 +1427,25 @@ SIModeRegisterDefaults::SIModeRegisterDefaults(const Function &F,
if (!DX10ClampAttr.empty())
DX10Clamp = DX10ClampAttr == "true";
- FP32Denormals = ST.hasFP32Denormals(F);
- FP64FP16Denormals = ST.hasFP64FP16Denormals(F);
+ StringRef DenormF32Attr = F.getFnAttribute("denormal-fp-math-f32").getValueAsString();
+ if (!DenormF32Attr.empty()) {
+ DenormalMode DenormMode = parseDenormalFPAttribute(DenormF32Attr);
+ FP32InputDenormals = DenormMode.Input == DenormalMode::IEEE;
+ FP32OutputDenormals = DenormMode.Output == DenormalMode::IEEE;
+ }
+
+ StringRef DenormAttr = F.getFnAttribute("denormal-fp-math").getValueAsString();
+ if (!DenormAttr.empty()) {
+ DenormalMode DenormMode = parseDenormalFPAttribute(DenormAttr);
+
+ if (DenormF32Attr.empty()) {
+ FP32InputDenormals = DenormMode.Input == DenormalMode::IEEE;
+ FP32OutputDenormals = DenormMode.Output == DenormalMode::IEEE;
+ }
+
+ FP64FP16InputDenormals = DenormMode.Input == DenormalMode::IEEE;
+ FP64FP16OutputDenormals = DenormMode.Output == DenormalMode::IEEE;
+ }
}
namespace {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index a5bada2890d2c..e71554575f6af 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -12,10 +12,10 @@
#include "AMDGPU.h"
#include "AMDKernelCodeT.h"
#include "SIDefines.h"
-#include "llvm/ADT/StringRef.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/AMDHSAKernelDescriptor.h"
+#include "llvm/Support/Alignment.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetParser.h"
@@ -26,17 +26,13 @@
namespace llvm {
class Argument;
-class AMDGPUSubtarget;
-class FeatureBitset;
class Function;
class GCNSubtarget;
class GlobalValue;
-class MCContext;
class MCRegisterClass;
class MCRegisterInfo;
-class MCSection;
class MCSubtargetInfo;
-class MachineMemOperand;
+class StringRef;
class Triple;
namespace AMDGPU {
@@ -87,15 +83,6 @@ unsigned getEUsPerCU(const MCSubtargetInfo *STI);
unsigned getMaxWorkGroupsPerCU(const MCSubtargetInfo *STI,
unsigned FlatWorkGroupSize);
-/// \returns Maximum number of waves per compute unit for given subtarget \p
-/// STI without any kind of limitation.
-unsigned getMaxWavesPerCU(const MCSubtargetInfo *STI);
-
-/// \returns Maximum number of waves per compute unit for given subtarget \p
-/// STI and limited by given \p FlatWorkGroupSize.
-unsigned getMaxWavesPerCU(const MCSubtargetInfo *STI,
- unsigned FlatWorkGroupSize);
-
/// \returns Minimum number of waves per execution unit for given subtarget \p
/// STI.
unsigned getMinWavesPerEU(const MCSubtargetInfo *STI);
@@ -104,10 +91,10 @@ unsigned getMinWavesPerEU(const MCSubtargetInfo *STI);
/// STI without any kind of limitation.
unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI);
-/// \returns Maximum number of waves per execution unit for given subtarget \p
-/// STI and limited by given \p FlatWorkGroupSize.
-unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI,
- unsigned FlatWorkGroupSize);
+/// \returns Number of waves per execution unit required to support the given \p
+/// FlatWorkGroupSize.
+unsigned getWavesPerEUForWorkGroup(const MCSubtargetInfo *STI,
+ unsigned FlatWorkGroupSize);
/// \returns Minimum flat work group size for given subtarget \p STI.
unsigned getMinFlatWorkGroupSize(const MCSubtargetInfo *STI);
@@ -116,7 +103,7 @@ unsigned getMinFlatWorkGroupSize(const MCSubtargetInfo *STI);
unsigned getMaxFlatWorkGroupSize(const MCSubtargetInfo *STI);
/// \returns Number of waves per work group for given subtarget \p STI and
-/// limited by given \p FlatWorkGroupSize.
+/// \p FlatWorkGroupSize.
unsigned getWavesPerWorkGroup(const MCSubtargetInfo *STI,
unsigned FlatWorkGroupSize);
@@ -211,6 +198,7 @@ struct MIMGBaseOpcodeInfo {
uint8_t NumExtraArgs;
bool Gradients;
+ bool G16;
bool Coordinates;
bool LodOrClampOrMip;
bool HasD16;
@@ -247,11 +235,19 @@ struct MIMGMIPMappingInfo {
MIMGBaseOpcode NONMIP;
};
+struct MIMGG16MappingInfo {
+ MIMGBaseOpcode G;
+ MIMGBaseOpcode G16;
+};
+
LLVM_READONLY
const MIMGLZMappingInfo *getMIMGLZMappingInfo(unsigned L);
LLVM_READONLY
-const MIMGMIPMappingInfo *getMIMGMIPMappingInfo(unsigned L);
+const MIMGMIPMappingInfo *getMIMGMIPMappingInfo(unsigned MIP);
+
+LLVM_READONLY
+const MIMGG16MappingInfo *getMIMGG16MappingInfo(unsigned G);
LLVM_READONLY
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding,
@@ -308,6 +304,9 @@ LLVM_READONLY
bool getMUBUFHasSoffset(unsigned Opc);
LLVM_READONLY
+bool getSMEMIsBuffer(unsigned Opc);
+
+LLVM_READONLY
const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t BitsPerComp,
uint8_t NumComponents,
uint8_t NumFormat,
@@ -551,6 +550,8 @@ inline bool isKernel(CallingConv::ID CC) {
bool hasXNACK(const MCSubtargetInfo &STI);
bool hasSRAMECC(const MCSubtargetInfo &STI);
bool hasMIMG_R128(const MCSubtargetInfo &STI);
+bool hasGFX10A16(const MCSubtargetInfo &STI);
+bool hasG16(const MCSubtargetInfo &STI);
bool hasPackedD16(const MCSubtargetInfo &STI);
bool isSI(const MCSubtargetInfo &STI);
@@ -558,6 +559,9 @@ bool isCI(const MCSubtargetInfo &STI);
bool isVI(const MCSubtargetInfo &STI);
bool isGFX9(const MCSubtargetInfo &STI);
bool isGFX10(const MCSubtargetInfo &STI);
+bool isGCN3Encoding(const MCSubtargetInfo &STI);
+bool isGFX10_BEncoding(const MCSubtargetInfo &STI);
+bool hasGFX10_3Insts(const MCSubtargetInfo &STI);
/// Is \p Reg a scalar register?
bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI);
@@ -633,6 +637,13 @@ inline unsigned getOperandSize(const MCInstrDesc &Desc, unsigned OpNo) {
return getOperandSize(Desc.OpInfo[OpNo]);
}
+/// Is this literal inlinable, and not one of the inline values intended for
+/// floating point operands.
+LLVM_READNONE
+inline bool isInlinableIntLiteral(int64_t Literal) {
+ return Literal >= -16 && Literal <= 64;
+}
+
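For reference, a caller deciding between an inline constant and a literal encoding for an integer operand might use this predicate as in the following sketch (pickEncoding and ImmKind are hypothetical names, not part of this patch):

// Sketch: integers in [-16, 64] can use the inline-constant encoding;
// anything else must be emitted as a separate literal dword.
#include "Utils/AMDGPUBaseInfo.h"
#include <cstdint>

enum class ImmKind { InlineConstant, Literal32 };

ImmKind pickEncoding(int64_t Imm) {
  return llvm::AMDGPU::isInlinableIntLiteral(Imm) ? ImmKind::InlineConstant
                                                  : ImmKind::Literal32;
}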
/// Is this literal inlinable
LLVM_READNONE
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi);
@@ -646,11 +657,35 @@ bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi);
LLVM_READNONE
bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi);
+LLVM_READNONE
+bool isInlinableIntLiteralV216(int32_t Literal);
+
bool isArgPassedInSGPR(const Argument *Arg);
-/// \returns The encoding that will be used for \p ByteOffset in the SMRD
-/// offset field.
-int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);
+LLVM_READONLY
+bool isLegalSMRDEncodedUnsignedOffset(const MCSubtargetInfo &ST,
+ int64_t EncodedOffset);
+
+LLVM_READONLY
+bool isLegalSMRDEncodedSignedOffset(const MCSubtargetInfo &ST,
+ int64_t EncodedOffset,
+ bool IsBuffer);
+
+/// Convert \p ByteOffset to dwords if the subtarget uses dword SMRD immediate
+/// offsets.
+uint64_t convertSMRDOffsetUnits(const MCSubtargetInfo &ST, uint64_t ByteOffset);
+
+/// \returns The encoding that will be used for \p ByteOffset in the
+/// SMRD offset field, or None if it won't fit. On GFX9 and GFX10,
+/// S_LOAD instructions have a signed offset; on other subtargets it is
+/// unsigned. S_BUFFER has an unsigned offset for all subtargets.
+Optional<int64_t> getSMRDEncodedOffset(const MCSubtargetInfo &ST,
+ int64_t ByteOffset, bool IsBuffer);
+
+/// \returns The encoding that can be used for a 32-bit literal offset in an SMRD
+/// instruction. This is only useful on CI.
+Optional<int64_t> getSMRDEncodedLiteralOffset32(const MCSubtargetInfo &ST,
+ int64_t ByteOffset);
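A minimal caller-side sketch of the new Optional-returning interface (encodeOrZero is a hypothetical helper, not part of this patch):

// Sketch: encode a byte offset for the SMRD offset field, falling back to 0
// when it does not fit. IsBuffer selects the S_BUFFER (always unsigned) rules.
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/Optional.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <cstdint>

int64_t encodeOrZero(const llvm::MCSubtargetInfo &ST, int64_t ByteOffset,
                     bool IsBuffer) {
  if (llvm::Optional<int64_t> Enc =
          llvm::AMDGPU::getSMRDEncodedOffset(ST, ByteOffset, IsBuffer))
    return *Enc;
  return 0; // a real caller would materialize the offset in a register instead
}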
/// \returns true if this offset is small enough to fit in the SMRD
/// offset field. \p ByteOffset should be the offset in bytes and
@@ -658,7 +693,8 @@ int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);
bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
- const GCNSubtarget *Subtarget, uint32_t Align = 4);
+ const GCNSubtarget *Subtarget,
+ Align Alignment = Align(4));
/// \returns true if the intrinsic is divergent
bool isIntrinsicSourceOfDivergence(unsigned IntrID);
@@ -677,45 +713,76 @@ struct SIModeRegisterDefaults {
/// If this is set, neither input nor output denormals are flushed for most f32
/// instructions.
- ///
- /// TODO: Split into separate input and output fields if necessary like the
- /// control bits really provide?
- bool FP32Denormals : 1;
+ bool FP32InputDenormals : 1;
+ bool FP32OutputDenormals : 1;
/// If this is set, neither input nor output denormals are flushed for both f64
/// and f16/v2f16 instructions.
- bool FP64FP16Denormals : 1;
+ bool FP64FP16InputDenormals : 1;
+ bool FP64FP16OutputDenormals : 1;
SIModeRegisterDefaults() :
IEEE(true),
DX10Clamp(true),
- FP32Denormals(true),
- FP64FP16Denormals(true) {}
+ FP32InputDenormals(true),
+ FP32OutputDenormals(true),
+ FP64FP16InputDenormals(true),
+ FP64FP16OutputDenormals(true) {}
- // FIXME: Should not depend on the subtarget
- SIModeRegisterDefaults(const Function &F, const GCNSubtarget &ST);
+ SIModeRegisterDefaults(const Function &F);
static SIModeRegisterDefaults getDefaultForCallingConv(CallingConv::ID CC) {
const bool IsCompute = AMDGPU::isCompute(CC);
SIModeRegisterDefaults Mode;
- Mode.DX10Clamp = true;
Mode.IEEE = IsCompute;
- Mode.FP32Denormals = false; // FIXME: Should be on by default.
- Mode.FP64FP16Denormals = true;
return Mode;
}
bool operator ==(const SIModeRegisterDefaults Other) const {
return IEEE == Other.IEEE && DX10Clamp == Other.DX10Clamp &&
- FP32Denormals == Other.FP32Denormals &&
- FP64FP16Denormals == Other.FP64FP16Denormals;
+ FP32InputDenormals == Other.FP32InputDenormals &&
+ FP32OutputDenormals == Other.FP32OutputDenormals &&
+ FP64FP16InputDenormals == Other.FP64FP16InputDenormals &&
+ FP64FP16OutputDenormals == Other.FP64FP16OutputDenormals;
+ }
+
+ bool allFP32Denormals() const {
+ return FP32InputDenormals && FP32OutputDenormals;
+ }
+
+ bool allFP64FP16Denormals() const {
+ return FP64FP16InputDenormals && FP64FP16OutputDenormals;
+ }
+
+ /// Get the encoding value for the FP_DENORM bits of the mode register for the
+ /// FP32 denormal mode.
+ uint32_t fpDenormModeSPValue() const {
+ if (FP32InputDenormals && FP32OutputDenormals)
+ return FP_DENORM_FLUSH_NONE;
+ if (FP32InputDenormals)
+ return FP_DENORM_FLUSH_OUT;
+ if (FP32OutputDenormals)
+ return FP_DENORM_FLUSH_IN;
+ return FP_DENORM_FLUSH_IN_FLUSH_OUT;
+ }
+
+ /// Get the encoding value for the FP_DENORM bits of the mode register for the
+ /// FP64/FP16 denormal mode.
+ uint32_t fpDenormModeDPValue() const {
+ if (FP64FP16InputDenormals && FP64FP16OutputDenormals)
+ return FP_DENORM_FLUSH_NONE;
+ if (FP64FP16InputDenormals)
+ return FP_DENORM_FLUSH_OUT;
+ if (FP64FP16OutputDenormals)
+ return FP_DENORM_FLUSH_IN;
+ return FP_DENORM_FLUSH_IN_FLUSH_OUT;
}
/// Returns true if a flag is compatible, i.e. it is enabled in the callee but
/// disabled in the caller, or set identically in both.
static bool oneWayCompatible(bool CallerMode, bool CalleeMode) {
- return CallerMode == CalleeMode || (CallerMode && !CalleeMode);
+ return CallerMode == CalleeMode || (!CallerMode && CalleeMode);
}
// FIXME: Inlining should be OK for dx10-clamp, since the caller's mode should
@@ -727,8 +794,10 @@ struct SIModeRegisterDefaults {
return false;
// Allow inlining denormals enabled into denormals flushed functions.
- return oneWayCompatible(FP64FP16Denormals, CalleeMode.FP64FP16Denormals) &&
- oneWayCompatible(FP32Denormals, CalleeMode.FP32Denormals);
+ return oneWayCompatible(FP64FP16InputDenormals, CalleeMode.FP64FP16InputDenormals) &&
+ oneWayCompatible(FP64FP16OutputDenormals, CalleeMode.FP64FP16OutputDenormals) &&
+ oneWayCompatible(FP32InputDenormals, CalleeMode.FP32InputDenormals) &&
+ oneWayCompatible(FP32OutputDenormals, CalleeMode.FP32OutputDenormals);
}
};
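For reference, a small sketch of how the new accessors might be read for a function (assuming SIModeRegisterDefaults lives in the llvm::AMDGPU namespace, as in this header; fp32DenormField is a hypothetical helper):

// Sketch: compute the FP32 denormal field encoding for a function's mode.
// If only input denormals are enabled, outputs are flushed (FP_DENORM_FLUSH_OUT);
// if only output denormals are enabled, inputs are flushed (FP_DENORM_FLUSH_IN).
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/IR/Function.h"
#include <cstdint>

uint32_t fp32DenormField(const llvm::Function &F) {
  llvm::AMDGPU::SIModeRegisterDefaults Mode(F);
  return Mode.fpDenormModeSPValue();
}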
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
index 207e4232e8298..ef010a7ac1576 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
@@ -397,6 +397,39 @@ static const char *getRegisterName(unsigned RegNum) {
{0x2c6a, "SPI_SHADER_USER_DATA_VS_30"},
{0x2c6b, "SPI_SHADER_USER_DATA_VS_31"},
+ {0x2c8c, "SPI_SHADER_USER_DATA_GS_0"},
+ {0x2c8d, "SPI_SHADER_USER_DATA_GS_1"},
+ {0x2c8e, "SPI_SHADER_USER_DATA_GS_2"},
+ {0x2c8f, "SPI_SHADER_USER_DATA_GS_3"},
+ {0x2c90, "SPI_SHADER_USER_DATA_GS_4"},
+ {0x2c91, "SPI_SHADER_USER_DATA_GS_5"},
+ {0x2c92, "SPI_SHADER_USER_DATA_GS_6"},
+ {0x2c93, "SPI_SHADER_USER_DATA_GS_7"},
+ {0x2c94, "SPI_SHADER_USER_DATA_GS_8"},
+ {0x2c95, "SPI_SHADER_USER_DATA_GS_9"},
+ {0x2c96, "SPI_SHADER_USER_DATA_GS_10"},
+ {0x2c97, "SPI_SHADER_USER_DATA_GS_11"},
+ {0x2c98, "SPI_SHADER_USER_DATA_GS_12"},
+ {0x2c99, "SPI_SHADER_USER_DATA_GS_13"},
+ {0x2c9a, "SPI_SHADER_USER_DATA_GS_14"},
+ {0x2c9b, "SPI_SHADER_USER_DATA_GS_15"},
+ {0x2c9c, "SPI_SHADER_USER_DATA_GS_16"},
+ {0x2c9d, "SPI_SHADER_USER_DATA_GS_17"},
+ {0x2c9e, "SPI_SHADER_USER_DATA_GS_18"},
+ {0x2c9f, "SPI_SHADER_USER_DATA_GS_19"},
+ {0x2ca0, "SPI_SHADER_USER_DATA_GS_20"},
+ {0x2ca1, "SPI_SHADER_USER_DATA_GS_21"},
+ {0x2ca2, "SPI_SHADER_USER_DATA_GS_22"},
+ {0x2ca3, "SPI_SHADER_USER_DATA_GS_23"},
+ {0x2ca4, "SPI_SHADER_USER_DATA_GS_24"},
+ {0x2ca5, "SPI_SHADER_USER_DATA_GS_25"},
+ {0x2ca6, "SPI_SHADER_USER_DATA_GS_26"},
+ {0x2ca7, "SPI_SHADER_USER_DATA_GS_27"},
+ {0x2ca8, "SPI_SHADER_USER_DATA_GS_28"},
+ {0x2ca9, "SPI_SHADER_USER_DATA_GS_29"},
+ {0x2caa, "SPI_SHADER_USER_DATA_GS_30"},
+ {0x2cab, "SPI_SHADER_USER_DATA_GS_31"},
+
{0x2ccc, "SPI_SHADER_USER_DATA_ES_0"},
{0x2ccd, "SPI_SHADER_USER_DATA_ES_1"},
{0x2cce, "SPI_SHADER_USER_DATA_ES_2"},
@@ -491,38 +524,55 @@ static const char *getRegisterName(unsigned RegNum) {
{0xa310, "PA_SC_SHADER_CONTROL"},
{0xa313, "PA_SC_CONSERVATIVE_RASTERIZATION_CNTL"},
- {0x2d0c, "SPI_SHADER_USER_DATA_LS_0"},
- {0x2d0d, "SPI_SHADER_USER_DATA_LS_1"},
- {0x2d0e, "SPI_SHADER_USER_DATA_LS_2"},
- {0x2d0f, "SPI_SHADER_USER_DATA_LS_3"},
- {0x2d10, "SPI_SHADER_USER_DATA_LS_4"},
- {0x2d11, "SPI_SHADER_USER_DATA_LS_5"},
- {0x2d12, "SPI_SHADER_USER_DATA_LS_6"},
- {0x2d13, "SPI_SHADER_USER_DATA_LS_7"},
- {0x2d14, "SPI_SHADER_USER_DATA_LS_8"},
- {0x2d15, "SPI_SHADER_USER_DATA_LS_9"},
- {0x2d16, "SPI_SHADER_USER_DATA_LS_10"},
- {0x2d17, "SPI_SHADER_USER_DATA_LS_11"},
- {0x2d18, "SPI_SHADER_USER_DATA_LS_12"},
- {0x2d19, "SPI_SHADER_USER_DATA_LS_13"},
- {0x2d1a, "SPI_SHADER_USER_DATA_LS_14"},
- {0x2d1b, "SPI_SHADER_USER_DATA_LS_15"},
- {0x2d1c, "SPI_SHADER_USER_DATA_LS_16"},
- {0x2d1d, "SPI_SHADER_USER_DATA_LS_17"},
- {0x2d1e, "SPI_SHADER_USER_DATA_LS_18"},
- {0x2d1f, "SPI_SHADER_USER_DATA_LS_19"},
- {0x2d20, "SPI_SHADER_USER_DATA_LS_20"},
- {0x2d21, "SPI_SHADER_USER_DATA_LS_21"},
- {0x2d22, "SPI_SHADER_USER_DATA_LS_22"},
- {0x2d23, "SPI_SHADER_USER_DATA_LS_23"},
- {0x2d24, "SPI_SHADER_USER_DATA_LS_24"},
- {0x2d25, "SPI_SHADER_USER_DATA_LS_25"},
- {0x2d26, "SPI_SHADER_USER_DATA_LS_26"},
- {0x2d27, "SPI_SHADER_USER_DATA_LS_27"},
- {0x2d28, "SPI_SHADER_USER_DATA_LS_28"},
- {0x2d29, "SPI_SHADER_USER_DATA_LS_29"},
- {0x2d2a, "SPI_SHADER_USER_DATA_LS_30"},
- {0x2d2b, "SPI_SHADER_USER_DATA_LS_31"},
+ {0x2d0c, "SPI_SHADER_USER_DATA_HS_0"},
+ {0x2d0d, "SPI_SHADER_USER_DATA_HS_1"},
+ {0x2d0e, "SPI_SHADER_USER_DATA_HS_2"},
+ {0x2d0f, "SPI_SHADER_USER_DATA_HS_3"},
+ {0x2d10, "SPI_SHADER_USER_DATA_HS_4"},
+ {0x2d11, "SPI_SHADER_USER_DATA_HS_5"},
+ {0x2d12, "SPI_SHADER_USER_DATA_HS_6"},
+ {0x2d13, "SPI_SHADER_USER_DATA_HS_7"},
+ {0x2d14, "SPI_SHADER_USER_DATA_HS_8"},
+ {0x2d15, "SPI_SHADER_USER_DATA_HS_9"},
+ {0x2d16, "SPI_SHADER_USER_DATA_HS_10"},
+ {0x2d17, "SPI_SHADER_USER_DATA_HS_11"},
+ {0x2d18, "SPI_SHADER_USER_DATA_HS_12"},
+ {0x2d19, "SPI_SHADER_USER_DATA_HS_13"},
+ {0x2d1a, "SPI_SHADER_USER_DATA_HS_14"},
+ {0x2d1b, "SPI_SHADER_USER_DATA_HS_15"},
+ {0x2d1c, "SPI_SHADER_USER_DATA_HS_16"},
+ {0x2d1d, "SPI_SHADER_USER_DATA_HS_17"},
+ {0x2d1e, "SPI_SHADER_USER_DATA_HS_18"},
+ {0x2d1f, "SPI_SHADER_USER_DATA_HS_19"},
+ {0x2d20, "SPI_SHADER_USER_DATA_HS_20"},
+ {0x2d21, "SPI_SHADER_USER_DATA_HS_21"},
+ {0x2d22, "SPI_SHADER_USER_DATA_HS_22"},
+ {0x2d23, "SPI_SHADER_USER_DATA_HS_23"},
+ {0x2d24, "SPI_SHADER_USER_DATA_HS_24"},
+ {0x2d25, "SPI_SHADER_USER_DATA_HS_25"},
+ {0x2d26, "SPI_SHADER_USER_DATA_HS_26"},
+ {0x2d27, "SPI_SHADER_USER_DATA_HS_27"},
+ {0x2d28, "SPI_SHADER_USER_DATA_HS_28"},
+ {0x2d29, "SPI_SHADER_USER_DATA_HS_29"},
+ {0x2d2a, "SPI_SHADER_USER_DATA_HS_30"},
+ {0x2d2b, "SPI_SHADER_USER_DATA_HS_31"},
+
+ {0x2d4c, "SPI_SHADER_USER_DATA_LS_0"},
+ {0x2d4d, "SPI_SHADER_USER_DATA_LS_1"},
+ {0x2d4e, "SPI_SHADER_USER_DATA_LS_2"},
+ {0x2d4f, "SPI_SHADER_USER_DATA_LS_3"},
+ {0x2d50, "SPI_SHADER_USER_DATA_LS_4"},
+ {0x2d51, "SPI_SHADER_USER_DATA_LS_5"},
+ {0x2d52, "SPI_SHADER_USER_DATA_LS_6"},
+ {0x2d53, "SPI_SHADER_USER_DATA_LS_7"},
+ {0x2d54, "SPI_SHADER_USER_DATA_LS_8"},
+ {0x2d55, "SPI_SHADER_USER_DATA_LS_9"},
+ {0x2d56, "SPI_SHADER_USER_DATA_LS_10"},
+ {0x2d57, "SPI_SHADER_USER_DATA_LS_11"},
+ {0x2d58, "SPI_SHADER_USER_DATA_LS_12"},
+ {0x2d59, "SPI_SHADER_USER_DATA_LS_13"},
+ {0x2d5a, "SPI_SHADER_USER_DATA_LS_14"},
+ {0x2d5b, "SPI_SHADER_USER_DATA_LS_15"},
{0xa2aa, "IA_MULTI_VGT_PARAM"},
{0xa2a5, "VGT_GS_MAX_PRIMS_PER_SUBGROUP"},
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
index 0f17c157b2062..544ab669d9ae2 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
@@ -14,16 +14,12 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUPALMETADATA_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUPALMETADATA_H
-#include "llvm/ADT/StringRef.h"
#include "llvm/BinaryFormat/MsgPackDocument.h"
-#include <map>
namespace llvm {
-class AMDGPUTargetStreamer;
-class formatted_raw_ostream;
-class MCStreamer;
class Module;
+class StringRef;
class AMDGPUPALMetadata {
unsigned BlobType = 0;
diff --git a/llvm/lib/Target/AMDGPU/VIInstructions.td b/llvm/lib/Target/AMDGPU/VIInstructions.td
deleted file mode 100644
index ec7d8875a746e..0000000000000
--- a/llvm/lib/Target/AMDGPU/VIInstructions.td
+++ /dev/null
@@ -1,13 +0,0 @@
-//===-- VIInstructions.td - VI Instruction Defintions ---------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// Instruction definitions for VI and newer.
-//===----------------------------------------------------------------------===//
-
-FIXME: Deleting this file broke buildbots that don't do full rebuilds. This
-file is no longer used by the backend, so it can be deleted once all
-the buildbots update there dependencies.
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index c7aed0985540a..17f334f62a30b 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -1,4 +1,4 @@
-//===-- VOP1Instructions.td - Vector Instruction Defintions ---------------===//
+//===-- VOP1Instructions.td - Vector Instruction Definitions --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -48,9 +48,13 @@ class VOP1_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP1On
let mayStore = 0;
let hasSideEffects = 0;
+ let ReadsModeReg = !or(isFloatType<P.DstVT>.ret, isFloatType<P.Src0VT>.ret);
+
+ let mayRaiseFPException = ReadsModeReg;
+
let VOP1 = 1;
let VALU = 1;
- let Uses = [EXEC];
+ let Uses = !if(ReadsModeReg, [MODE, EXEC], [EXEC]);
let AsmVariantName = AMDGPUAsmVariants.Default;
}
@@ -89,9 +93,7 @@ class VOP1_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
class getVOP1Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies {
list<dag> ret =
!if(P.HasModifiers,
- [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0,
- i32:$src0_modifiers,
- i1:$clamp, i32:$omod))))],
+ [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods P.Src0VT:$src0, i32:$src0_modifiers))))],
!if(P.HasOMod,
[(set P.DstVT:$vdst, (node (P.Src0VT (VOP3OMods P.Src0VT:$src0,
i1:$clamp, i32:$omod))))],
@@ -102,8 +104,13 @@ class getVOP1Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies {
multiclass VOP1Inst <string opName, VOPProfile P,
SDPatternOperator node = null_frag> {
- def _e32 : VOP1_Pseudo <opName, P>;
- def _e64 : VOP3_Pseudo <opName, P, getVOP1Pat64<node, P>.ret>;
+ // We only want to set this on the basic, non-SDWA or DPP forms.
+ defvar should_mov_imm = !eq(opName, "v_mov_b32");
+
+ let isMoveImm = should_mov_imm in {
+ def _e32 : VOP1_Pseudo <opName, P>;
+ def _e64 : VOP3_Pseudo <opName, P, getVOP1Pat64<node, P>.ret>;
+ }
foreach _ = BoolToList<P.HasExtSDWA>.ret in
def _sdwa : VOP1_SDWA_Pseudo <opName, P>;
@@ -146,7 +153,7 @@ let VOPAsmPrefer32Bit = 1 in {
defm V_NOP : VOP1Inst <"v_nop", VOP_NONE>;
}
-let isMoveImm = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
defm V_MOV_B32 : VOP1Inst <"v_mov_b32", VOP_I32_I32>;
} // End isReMaterializable = 1, isAsCheapAsAMove = 1
@@ -183,31 +190,51 @@ def V_READFIRSTLANE_B32 :
let SchedRW = [WriteDoubleCvt] in {
defm V_CVT_I32_F64 : VOP1Inst <"v_cvt_i32_f64", VOP_I32_F64, fp_to_sint>;
+
+let mayRaiseFPException = 0 in {
defm V_CVT_F64_I32 : VOP1Inst <"v_cvt_f64_i32", VOP1_F64_I32, sint_to_fp>;
+}
+
defm V_CVT_F32_F64 : VOP1Inst <"v_cvt_f32_f64", VOP_F32_F64, fpround>;
defm V_CVT_F64_F32 : VOP1Inst <"v_cvt_f64_f32", VOP_F64_F32, fpextend>;
defm V_CVT_U32_F64 : VOP1Inst <"v_cvt_u32_f64", VOP_I32_F64, fp_to_uint>;
+
+let mayRaiseFPException = 0 in {
defm V_CVT_F64_U32 : VOP1Inst <"v_cvt_f64_u32", VOP1_F64_I32, uint_to_fp>;
+}
+
} // End SchedRW = [WriteDoubleCvt]
-let SchedRW = [WriteQuarterRate32] in {
+let SchedRW = [WriteFloatCvt] in {
+
+// XXX: Does this really not raise exceptions? The manual claims the
+// 16-bit ones can.
+let mayRaiseFPException = 0 in {
defm V_CVT_F32_I32 : VOP1Inst <"v_cvt_f32_i32", VOP1_F32_I32, sint_to_fp>;
defm V_CVT_F32_U32 : VOP1Inst <"v_cvt_f32_u32", VOP1_F32_I32, uint_to_fp>;
+}
+
defm V_CVT_U32_F32 : VOP1Inst <"v_cvt_u32_f32", VOP_I32_F32, fp_to_uint>;
defm V_CVT_I32_F32 : VOP1Inst <"v_cvt_i32_f32", VOP_I32_F32, fp_to_sint>;
let FPDPRounding = 1 in {
defm V_CVT_F16_F32 : VOP1Inst <"v_cvt_f16_f32", VOP_F16_F32, fpround>;
} // End FPDPRounding = 1
+
defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_F16, fpextend>;
+
+let ReadsModeReg = 0, mayRaiseFPException = 0 in {
defm V_CVT_RPI_I32_F32 : VOP1Inst <"v_cvt_rpi_i32_f32", VOP_I32_F32, cvt_rpi_i32_f32>;
defm V_CVT_FLR_I32_F32 : VOP1Inst <"v_cvt_flr_i32_f32", VOP_I32_F32, cvt_flr_i32_f32>;
defm V_CVT_OFF_F32_I4 : VOP1Inst <"v_cvt_off_f32_i4", VOP1_F32_I32>;
-} // End SchedRW = [WriteQuarterRate32]
+} // End ReadsModeReg = 0, mayRaiseFPException = 0
+} // End SchedRW = [WriteFloatCvt]
+let ReadsModeReg = 0, mayRaiseFPException = 0 in {
defm V_CVT_F32_UBYTE0 : VOP1Inst <"v_cvt_f32_ubyte0", VOP1_F32_I32, AMDGPUcvt_f32_ubyte0>;
defm V_CVT_F32_UBYTE1 : VOP1Inst <"v_cvt_f32_ubyte1", VOP1_F32_I32, AMDGPUcvt_f32_ubyte1>;
defm V_CVT_F32_UBYTE2 : VOP1Inst <"v_cvt_f32_ubyte2", VOP1_F32_I32, AMDGPUcvt_f32_ubyte2>;
defm V_CVT_F32_UBYTE3 : VOP1Inst <"v_cvt_f32_ubyte3", VOP1_F32_I32, AMDGPUcvt_f32_ubyte3>;
+} // End ReadsModeReg = 0, mayRaiseFPException = 0
defm V_FRACT_F32 : VOP1Inst <"v_fract_f32", VOP_F32_F32, AMDGPUfract>;
defm V_TRUNC_F32 : VOP1Inst <"v_trunc_f32", VOP_F32_F32, ftrunc>;
@@ -215,33 +242,30 @@ defm V_CEIL_F32 : VOP1Inst <"v_ceil_f32", VOP_F32_F32, fceil>;
defm V_RNDNE_F32 : VOP1Inst <"v_rndne_f32", VOP_F32_F32, frint>;
defm V_FLOOR_F32 : VOP1Inst <"v_floor_f32", VOP_F32_F32, ffloor>;
-let SchedRW = [WriteQuarterRate32] in {
+let SchedRW = [WriteTrans32] in {
defm V_EXP_F32 : VOP1Inst <"v_exp_f32", VOP_F32_F32, fexp2>;
defm V_LOG_F32 : VOP1Inst <"v_log_f32", VOP_F32_F32, flog2>;
defm V_RCP_F32 : VOP1Inst <"v_rcp_f32", VOP_F32_F32, AMDGPUrcp>;
defm V_RCP_IFLAG_F32 : VOP1Inst <"v_rcp_iflag_f32", VOP_F32_F32, AMDGPUrcp_iflag>;
defm V_RSQ_F32 : VOP1Inst <"v_rsq_f32", VOP_F32_F32, AMDGPUrsq>;
-defm V_SQRT_F32 : VOP1Inst <"v_sqrt_f32", VOP_F32_F32, fsqrt>;
-} // End SchedRW = [WriteQuarterRate32]
+defm V_SQRT_F32 : VOP1Inst <"v_sqrt_f32", VOP_F32_F32, any_amdgcn_sqrt>;
+} // End SchedRW = [WriteTrans32]
-let SchedRW = [WriteDouble] in {
+let SchedRW = [WriteTrans64] in {
defm V_RCP_F64 : VOP1Inst <"v_rcp_f64", VOP_F64_F64, AMDGPUrcp>;
defm V_RSQ_F64 : VOP1Inst <"v_rsq_f64", VOP_F64_F64, AMDGPUrsq>;
-} // End SchedRW = [WriteDouble];
-
-let SchedRW = [WriteDouble] in {
-defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, fsqrt>;
-} // End SchedRW = [WriteDouble]
+defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, any_amdgcn_sqrt>;
+} // End SchedRW = [WriteTrans64]
-let SchedRW = [WriteQuarterRate32] in {
+let SchedRW = [WriteTrans32] in {
defm V_SIN_F32 : VOP1Inst <"v_sin_f32", VOP_F32_F32, AMDGPUsin>;
defm V_COS_F32 : VOP1Inst <"v_cos_f32", VOP_F32_F32, AMDGPUcos>;
-} // End SchedRW = [WriteQuarterRate32]
+} // End SchedRW = [WriteTrans32]
defm V_NOT_B32 : VOP1Inst <"v_not_b32", VOP_I32_I32>;
defm V_BFREV_B32 : VOP1Inst <"v_bfrev_b32", VOP_I32_I32, bitreverse>;
defm V_FFBH_U32 : VOP1Inst <"v_ffbh_u32", VOP_I32_I32, AMDGPUffbh_u32>;
-defm V_FFBL_B32 : VOP1Inst <"v_ffbl_b32", VOP_I32_I32>;
+defm V_FFBL_B32 : VOP1Inst <"v_ffbl_b32", VOP_I32_I32, AMDGPUffbl_b32>;
defm V_FFBH_I32 : VOP1Inst <"v_ffbh_i32", VOP_I32_I32, AMDGPUffbh_i32>;
let SchedRW = [WriteDoubleAdd] in {
@@ -317,7 +341,7 @@ defm V_MOVRELSD_B32 : VOP1Inst <"v_movrelsd_b32", VOP_MOVRELSD>;
defm V_MOV_FED_B32 : VOP1Inst <"v_mov_fed_b32", VOP_I32_I32>;
let SubtargetPredicate = isGFX6GFX7 in {
- let SchedRW = [WriteQuarterRate32] in {
+ let SchedRW = [WriteTrans32] in {
defm V_LOG_CLAMP_F32 :
VOP1Inst<"v_log_clamp_f32", VOP_F32_F32, int_amdgcn_log_clamp>;
defm V_RCP_CLAMP_F32 :
@@ -327,8 +351,8 @@ let SubtargetPredicate = isGFX6GFX7 in {
defm V_RSQ_CLAMP_F32 :
VOP1Inst<"v_rsq_clamp_f32", VOP_F32_F32, AMDGPUrsq_clamp>;
defm V_RSQ_LEGACY_F32 :
- VOP1Inst<"v_rsq_legacy_f32", VOP_F32_F32, AMDGPUrsq_legacy>;
- } // End SchedRW = [WriteQuarterRate32]
+ VOP1Inst<"v_rsq_legacy_f32", VOP_F32_F32, int_amdgcn_rsq_legacy>;
+ } // End SchedRW = [WriteTrans32]
let SchedRW = [WriteDouble] in {
defm V_RCP_CLAMP_F64 :
@@ -339,10 +363,10 @@ let SubtargetPredicate = isGFX6GFX7 in {
} // End SubtargetPredicate = isGFX6GFX7
let SubtargetPredicate = isGFX7GFX8GFX9 in {
- let SchedRW = [WriteQuarterRate32] in {
+ let SchedRW = [WriteTrans32] in {
defm V_LOG_LEGACY_F32 : VOP1Inst<"v_log_legacy_f32", VOP_F32_F32>;
defm V_EXP_LEGACY_F32 : VOP1Inst<"v_exp_legacy_f32", VOP_F32_F32>;
- } // End SchedRW = [WriteQuarterRate32]
+ } // End SchedRW = [WriteTrans32]
} // End SubtargetPredicate = isGFX7GFX8GFX9
let SubtargetPredicate = isGFX7Plus in {
@@ -362,15 +386,15 @@ defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP1_F16_I16, sint_to_fp>;
} // End FPDPRounding = 1
defm V_CVT_U16_F16 : VOP1Inst <"v_cvt_u16_f16", VOP_I16_F16, fp_to_uint>;
defm V_CVT_I16_F16 : VOP1Inst <"v_cvt_i16_f16", VOP_I16_F16, fp_to_sint>;
-let SchedRW = [WriteQuarterRate32] in {
+let SchedRW = [WriteTrans32] in {
defm V_RCP_F16 : VOP1Inst <"v_rcp_f16", VOP_F16_F16, AMDGPUrcp>;
-defm V_SQRT_F16 : VOP1Inst <"v_sqrt_f16", VOP_F16_F16, fsqrt>;
+defm V_SQRT_F16 : VOP1Inst <"v_sqrt_f16", VOP_F16_F16, any_amdgcn_sqrt>;
defm V_RSQ_F16 : VOP1Inst <"v_rsq_f16", VOP_F16_F16, AMDGPUrsq>;
defm V_LOG_F16 : VOP1Inst <"v_log_f16", VOP_F16_F16, flog2>;
defm V_EXP_F16 : VOP1Inst <"v_exp_f16", VOP_F16_F16, fexp2>;
defm V_SIN_F16 : VOP1Inst <"v_sin_f16", VOP_F16_F16, AMDGPUsin>;
defm V_COS_F16 : VOP1Inst <"v_cos_f16", VOP_F16_F16, AMDGPUcos>;
-} // End SchedRW = [WriteQuarterRate32]
+} // End SchedRW = [WriteTrans32]
defm V_FREXP_MANT_F16 : VOP1Inst <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>;
defm V_FREXP_EXP_I16_F16 : VOP1Inst <"v_frexp_exp_i16_f16", VOP_I16_F16, int_amdgcn_frexp_exp>;
defm V_FLOOR_F16 : VOP1Inst <"v_floor_f16", VOP_F16_F16, ffloor>;
@@ -414,8 +438,11 @@ let SubtargetPredicate = isGFX9Plus in {
}
defm V_SAT_PK_U8_I16 : VOP1Inst<"v_sat_pk_u8_i16", VOP_I32_I32>;
- defm V_CVT_NORM_I16_F16 : VOP1Inst<"v_cvt_norm_i16_f16", VOP_I16_F16>;
- defm V_CVT_NORM_U16_F16 : VOP1Inst<"v_cvt_norm_u16_f16", VOP_I16_F16>;
+
+ let mayRaiseFPException = 0 in {
+ defm V_CVT_NORM_I16_F16 : VOP1Inst<"v_cvt_norm_i16_f16", VOP_I16_F16>;
+ defm V_CVT_NORM_U16_F16 : VOP1Inst<"v_cvt_norm_u16_f16", VOP_I16_F16>;
+ } // End mayRaiseFPException = 0
} // End SubtargetPredicate = isGFX9Plus
let SubtargetPredicate = isGFX9Only in {
@@ -458,7 +485,7 @@ class VOP1_DPP<bits<8> op, VOP1_DPP_Pseudo ps, VOPProfile p = ps.Pfl, bit isDPP1
class VOP1_DPP16<bits<8> op, VOP1_DPP_Pseudo ps, VOPProfile p = ps.Pfl> :
VOP1_DPP<op, ps, p, 1>,
SIMCInstr <ps.PseudoInstr, SIEncodingFamily.GFX10> {
- let AssemblerPredicate = !if(p.HasExt, HasDPP16, DisableInst);
+ let AssemblerPredicate = HasDPP16;
let SubtargetPredicate = HasDPP16;
}
@@ -475,7 +502,7 @@ class VOP1_DPP8<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> :
let Inst{24-17} = !if(p.EmitDst, vdst{7-0}, 0);
let Inst{31-25} = 0x3f;
- let AssemblerPredicate = !if(p.HasExt, HasDPP8, DisableInst);
+ let AssemblerPredicate = HasDPP8;
let SubtargetPredicate = HasDPP8;
}
@@ -812,42 +839,23 @@ def V_MOV_B32_indirect : VPseudoInstSI<(outs),
let SubtargetPredicate = isGFX8GFX9;
}
-// This is a pseudo variant of the v_movreld_b32 instruction in which the
-// vector operand appears only twice, once as def and once as use. Using this
-// pseudo avoids problems with the Two Address instructions pass.
-class V_MOVRELD_B32_pseudo<RegisterClass rc> : VPseudoInstSI <
- (outs rc:$vdst),
- (ins rc:$vsrc, VSrc_b32:$val, i32imm:$offset)> {
- let VOP1 = 1;
-
- let Constraints = "$vsrc = $vdst";
- let Uses = [M0, EXEC];
-
- let SubtargetPredicate = HasMovrel;
-}
-
-def V_MOVRELD_B32_V1 : V_MOVRELD_B32_pseudo<VGPR_32>;
-def V_MOVRELD_B32_V2 : V_MOVRELD_B32_pseudo<VReg_64>;
-def V_MOVRELD_B32_V4 : V_MOVRELD_B32_pseudo<VReg_128>;
-def V_MOVRELD_B32_V8 : V_MOVRELD_B32_pseudo<VReg_256>;
-def V_MOVRELD_B32_V16 : V_MOVRELD_B32_pseudo<VReg_512>;
-
let OtherPredicates = [isGFX8Plus] in {
def : GCNPat <
- (i32 (int_amdgcn_mov_dpp i32:$src, timm:$dpp_ctrl, timm:$row_mask, timm:$bank_mask,
- timm:$bound_ctrl)),
- (V_MOV_B32_dpp $src, $src, (as_i32imm $dpp_ctrl),
- (as_i32imm $row_mask), (as_i32imm $bank_mask),
- (as_i1imm $bound_ctrl))
+ (i32 (int_amdgcn_mov_dpp i32:$src, timm:$dpp_ctrl, timm:$row_mask,
+ timm:$bank_mask, timm:$bound_ctrl)),
+ (V_MOV_B32_dpp VGPR_32:$src, VGPR_32:$src, (as_i32timm $dpp_ctrl),
+ (as_i32timm $row_mask), (as_i32timm $bank_mask),
+ (as_i1timm $bound_ctrl))
>;
def : GCNPat <
- (i32 (int_amdgcn_update_dpp i32:$old, i32:$src, timm:$dpp_ctrl, timm:$row_mask,
- timm:$bank_mask, timm:$bound_ctrl)),
- (V_MOV_B32_dpp $old, $src, (as_i32imm $dpp_ctrl),
- (as_i32imm $row_mask), (as_i32imm $bank_mask),
- (as_i1imm $bound_ctrl))
+ (i32 (int_amdgcn_update_dpp i32:$old, i32:$src, timm:$dpp_ctrl,
+ timm:$row_mask, timm:$bank_mask,
+ timm:$bound_ctrl)),
+ (V_MOV_B32_dpp VGPR_32:$old, VGPR_32:$src, (as_i32timm $dpp_ctrl),
+ (as_i32timm $row_mask), (as_i32timm $bank_mask),
+ (as_i1timm $bound_ctrl))
>;
} // End OtherPredicates = [isGFX8Plus]
@@ -907,6 +915,7 @@ defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>;
let OtherPredicates = [isGFX10Plus] in {
def : GCNPat <
(i32 (int_amdgcn_mov_dpp8 i32:$src, timm:$dpp8)),
- (V_MOV_B32_dpp8_gfx10 $src, $src, (as_i32imm $dpp8), (i32 DPP8Mode.FI_0))
+ (V_MOV_B32_dpp8_gfx10 VGPR_32:$src, VGPR_32:$src,
+ (as_i32timm $dpp8), (i32 DPP8Mode.FI_0))
>;
} // End OtherPredicates = [isGFX10Plus]
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index aaadc3dbc7215..aa37dbf1418f9 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -1,4 +1,4 @@
-//===-- VOP2Instructions.td - Vector Instruction Defintions ---------------===//
+//===-- VOP2Instructions.td - Vector Instruction Definitions --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -69,9 +69,13 @@ class VOP2_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], string suf
let mayStore = 0;
let hasSideEffects = 0;
+ let ReadsModeReg = !or(isFloatType<P.DstVT>.ret, isFloatType<P.Src0VT>.ret);
+
+ let mayRaiseFPException = ReadsModeReg;
+
let VOP2 = 1;
let VALU = 1;
- let Uses = [EXEC];
+ let Uses = !if(ReadsModeReg, [MODE, EXEC], [EXEC]);
let AsmVariantName = AMDGPUAsmVariants.Default;
}
@@ -459,17 +463,18 @@ def VOP_WRITELANE : VOPProfile<[i32, i32, i32, i32]> {
//===----------------------------------------------------------------------===//
defm V_CNDMASK_B32 : VOP2eInst <"v_cndmask_b32", VOP2e_I32_I32_I32_I1>;
+let SubtargetPredicate = HasMadMacF32Insts in
def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32, []>;
let isCommutable = 1 in {
-defm V_ADD_F32 : VOP2Inst <"v_add_f32", VOP_F32_F32_F32, fadd>;
+defm V_ADD_F32 : VOP2Inst <"v_add_f32", VOP_F32_F32_F32, any_fadd>;
defm V_SUB_F32 : VOP2Inst <"v_sub_f32", VOP_F32_F32_F32, fsub>;
defm V_SUBREV_F32 : VOP2Inst <"v_subrev_f32", VOP_F32_F32_F32, null_frag, "v_sub_f32">;
defm V_MUL_LEGACY_F32 : VOP2Inst <"v_mul_legacy_f32", VOP_F32_F32_F32, AMDGPUfmul_legacy>;
-defm V_MUL_F32 : VOP2Inst <"v_mul_f32", VOP_F32_F32_F32, fmul>;
-defm V_MUL_I32_I24 : VOP2Inst <"v_mul_i32_i24", VOP_I32_I32_I32, AMDGPUmul_i24>;
+defm V_MUL_F32 : VOP2Inst <"v_mul_f32", VOP_F32_F32_F32, any_fmul>;
+defm V_MUL_I32_I24 : VOP2Inst <"v_mul_i32_i24", VOP_I32_I32_I32_ARITH, AMDGPUmul_i24>;
defm V_MUL_HI_I32_I24 : VOP2Inst <"v_mul_hi_i32_i24", VOP_PAT_GEN<VOP_I32_I32_I32, 2>, AMDGPUmulhi_i24>;
-defm V_MUL_U32_U24 : VOP2Inst <"v_mul_u32_u24", VOP_I32_I32_I32, AMDGPUmul_u24>;
+defm V_MUL_U32_U24 : VOP2Inst <"v_mul_u32_u24", VOP_I32_I32_I32_ARITH, AMDGPUmul_u24>;
defm V_MUL_HI_U32_U24 : VOP2Inst <"v_mul_hi_u32_u24", VOP_PAT_GEN<VOP_I32_I32_I32, 2>, AMDGPUmulhi_u24>;
defm V_MIN_F32 : VOP2Inst <"v_min_f32", VOP_F32_F32_F32, fminnum_like>;
defm V_MAX_F32 : VOP2Inst <"v_max_f32", VOP_F32_F32_F32, fmaxnum_like>;
@@ -484,12 +489,16 @@ defm V_AND_B32 : VOP2Inst <"v_and_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, and>;
defm V_OR_B32 : VOP2Inst <"v_or_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, or>;
defm V_XOR_B32 : VOP2Inst <"v_xor_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, xor>;
+let mayRaiseFPException = 0 in {
+let SubtargetPredicate = HasMadMacF32Insts in {
let Constraints = "$vdst = $src2", DisableEncoding="$src2",
isConvertibleToThreeAddress = 1 in {
defm V_MAC_F32 : VOP2Inst <"v_mac_f32", VOP_MAC_F32>;
}
def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32, []>;
+} // End SubtargetPredicate = HasMadMacF32Insts
+}
// No patterns so that the scalar instructions are always selected.
// The scalar versions will be replaced with vector when needed later.
@@ -529,8 +538,12 @@ defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_NO_EXT<VOP_I32_I32
defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_hi>;
defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_NO_EXT<VOP_F32_F32_I32>, AMDGPUldexp>;
defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_NO_EXT<VOP_I32_F32_I32>>; // TODO: set "Uses = dst"
+
+let ReadsModeReg = 0, mayRaiseFPException = 0 in {
defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT<VOP_V2I16_F32_F32>, AMDGPUpknorm_i16_f32>;
defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_NO_EXT<VOP_V2I16_F32_F32>, AMDGPUpknorm_u16_f32>;
+}
+
defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_NO_EXT<VOP_V2F16_F32_F32>, AMDGPUpkrtz_f16_f32>;
defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT<VOP_V2I16_I32_I32>, AMDGPUpk_u16_u32>;
defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT<VOP_V2I16_I32_I32>, AMDGPUpk_i16_i32>;
@@ -541,14 +554,18 @@ defm V_MIN_LEGACY_F32 : VOP2Inst <"v_min_legacy_f32", VOP_F32_F32_F32, AMDGPUfmi
defm V_MAX_LEGACY_F32 : VOP2Inst <"v_max_legacy_f32", VOP_F32_F32_F32, AMDGPUfmax_legacy>;
} // End SubtargetPredicate = isGFX6GFX7
-let SubtargetPredicate = isGFX6GFX7GFX10 in {
let isCommutable = 1 in {
+let SubtargetPredicate = isGFX6GFX7GFX10 in {
+let OtherPredicates = [HasMadMacF32Insts] in
defm V_MAC_LEGACY_F32 : VOP2Inst <"v_mac_legacy_f32", VOP_F32_F32_F32>;
-defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_I32_I32_I32, srl>;
-defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_I32_I32_I32, sra>;
-defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_I32_I32_I32, shl>;
-} // End isCommutable = 1
} // End SubtargetPredicate = isGFX6GFX7GFX10
+let SubtargetPredicate = isGFX6GFX7 in {
+defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, srl>;
+defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, sra>;
+defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, shl>;
+} // End SubtargetPredicate = isGFX6GFX7
+} // End isCommutable = 1
+
class DivergentBinOp<SDPatternOperator Op, VOP_Pseudo Inst> :
GCNPat<
@@ -617,15 +634,19 @@ defm V_ASHRREV_I16 : VOP2Inst <"v_ashrrev_i16", VOP_I16_I16_I16, ashr_rev>;
let isCommutable = 1 in {
let FPDPRounding = 1 in {
-defm V_ADD_F16 : VOP2Inst <"v_add_f16", VOP_F16_F16_F16, fadd>;
+defm V_ADD_F16 : VOP2Inst <"v_add_f16", VOP_F16_F16_F16, any_fadd>;
defm V_SUB_F16 : VOP2Inst <"v_sub_f16", VOP_F16_F16_F16, fsub>;
defm V_SUBREV_F16 : VOP2Inst <"v_subrev_f16", VOP_F16_F16_F16, null_frag, "v_sub_f16">;
-defm V_MUL_F16 : VOP2Inst <"v_mul_f16", VOP_F16_F16_F16, fmul>;
+defm V_MUL_F16 : VOP2Inst <"v_mul_f16", VOP_F16_F16_F16, any_fmul>;
+
+let mayRaiseFPException = 0 in {
def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16, [], "">;
+}
+
} // End FPDPRounding = 1
-defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16, add>;
-defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16, sub>;
-defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16, null_frag, "v_sub_u16">;
+defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16_ARITH, add>;
+defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16_ARITH, sub>;
+defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16_ARITH, null_frag, "v_sub_u16">;
defm V_MUL_LO_U16 : VOP2Inst <"v_mul_lo_u16", VOP_I16_I16_I16, mul>;
defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16, fmaxnum_like>;
defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16, fminnum_like>;
@@ -770,16 +791,16 @@ let Predicates = [Has16BitInsts] in {
// an inline immediate than -c.
// TODO: Also do for 64-bit.
def : GCNPat<
- (add i16:$src0, (i16 NegSubInlineConst16:$src1)),
- (V_SUB_U16_e64 VSrc_b16:$src0, NegSubInlineConst16:$src1)
+ (add i16:$src0, (i16 NegSubInlineIntConst16:$src1)),
+ (V_SUB_U16_e64 VSrc_b16:$src0, NegSubInlineIntConst16:$src1)
>;
let Predicates = [Has16BitInsts, isGFX7GFX8GFX9] in {
def : GCNPat<
- (i32 (zext (add i16:$src0, (i16 NegSubInlineConst16:$src1)))),
- (V_SUB_U16_e64 VSrc_b16:$src0, NegSubInlineConst16:$src1)
+ (i32 (zext (add i16:$src0, (i16 NegSubInlineIntConst16:$src1)))),
+ (V_SUB_U16_e64 VSrc_b16:$src0, NegSubInlineIntConst16:$src1)
>;
defm : Arithmetic_i16_0Hi_Pats<add, V_ADD_U16_e64>;
@@ -831,7 +852,7 @@ class VOP2_DPP<bits<6> op, VOP2_DPP_Pseudo ps,
class Base_VOP2_DPP16<bits<6> op, VOP2_DPP_Pseudo ps,
string opName = ps.OpName, VOPProfile p = ps.Pfl> :
VOP2_DPP<op, ps, opName, p, 1> {
- let AssemblerPredicate = !if(p.HasExt, HasDPP16, DisableInst);
+ let AssemblerPredicate = HasDPP16;
let SubtargetPredicate = HasDPP16;
}
@@ -857,7 +878,7 @@ class VOP2_DPP8<bits<6> op, VOP2_Pseudo ps,
let Inst{30-25} = op;
let Inst{31} = 0x0;
- let AssemblerPredicate = !if(p.HasExt, HasDPP8, DisableInst);
+ let AssemblerPredicate = HasDPP8;
let SubtargetPredicate = HasDPP8;
}
@@ -1250,9 +1271,9 @@ defm V_SUBBREV_U32 : VOP2be_Real_gfx6_gfx7<0x02a>;
defm V_READLANE_B32 : VOP2Only_Real_gfx6_gfx7<0x001>;
-let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VSrc_b32:$vdst_in) in {
+let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in) in {
defm V_WRITELANE_B32 : VOP2Only_Real_gfx6_gfx7<0x002>;
-} // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VSrc_b32:$vdst_in)
+} // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in)
let SubtargetPredicate = isGFX6GFX7 in {
defm : VOP2eInstAliases<V_CNDMASK_B32_e32, V_CNDMASK_B32_e32_gfx6_gfx7>;
@@ -1261,6 +1282,7 @@ let SubtargetPredicate = isGFX6GFX7 in {
defm V_ADD_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x003>;
defm V_SUB_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x004>;
defm V_SUBREV_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x005>;
+let OtherPredicates = [HasMadMacF32Insts] in
defm V_MAC_LEGACY_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x006>;
defm V_MUL_LEGACY_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x007>;
defm V_MUL_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x008>;
@@ -1593,3 +1615,9 @@ let SubtargetPredicate = HasDot3Insts in {
let SubtargetPredicate = HasPkFmacF16Inst in {
defm V_PK_FMAC_F16 : VOP2_Real_e32_vi<0x3c>;
} // End SubtargetPredicate = HasPkFmacF16Inst
+
+let SubtargetPredicate = HasDot3Insts in {
+ // NB: Opcode conflicts with V_DOT2C_F32_F16
+ let DecoderNamespace = "GFX10_B" in
+ defm V_DOT8C_I32_I4 : VOP2_Real_DOT_ACC_gfx10<0x02>;
+}
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 67c8b926302d5..169949f2171ae 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1,4 +1,4 @@
-//===-- VOP3Instructions.td - Vector Instruction Defintions ---------------===//
+//===-- VOP3Instructions.td - Vector Instruction Definitions --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -32,20 +32,26 @@ class getVOP3ModPat<VOPProfile P, SDPatternOperator node> {
ret1));
}
-class getVOP3PModPat<VOPProfile P, SDPatternOperator node> {
+class getVOP3PModPat<VOPProfile P, SDPatternOperator node, bit HasExplicitClamp> {
+ dag src0_dag = (P.Src0VT (VOP3PMods P.Src0VT:$src0, i32:$src0_modifiers));
+ dag src1_dag = (P.Src1VT (VOP3PMods P.Src1VT:$src1, i32:$src1_modifiers));
+ dag src2_dag = (P.Src2VT (VOP3PMods P.Src2VT:$src2, i32:$src2_modifiers));
+ dag clamp_dag = (i1 timm:$clamp);
+
list<dag> ret3 = [(set P.DstVT:$vdst,
- (DivergentFragOrOp<node, P>.ret (P.Src0VT !if(P.HasClamp, (VOP3PMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp),
- (VOP3PMods P.Src0VT:$src0, i32:$src0_modifiers))),
- (P.Src1VT (VOP3PMods P.Src1VT:$src1, i32:$src1_modifiers)),
- (P.Src2VT (VOP3PMods P.Src2VT:$src2, i32:$src2_modifiers))))];
+ !if(HasExplicitClamp,
+ (DivergentFragOrOp<node, P>.ret src0_dag, src1_dag, src2_dag, clamp_dag),
+ (DivergentFragOrOp<node, P>.ret src0_dag, src1_dag, src2_dag)))];
list<dag> ret2 = [(set P.DstVT:$vdst,
- (DivergentFragOrOp<node, P>.ret !if(P.HasClamp, (P.Src0VT (VOP3PMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)),
- (P.Src0VT (VOP3PMods P.Src0VT:$src0, i32:$src0_modifiers))),
- (P.Src1VT (VOP3PMods P.Src1VT:$src1, i32:$src1_modifiers))))];
+ !if(HasExplicitClamp,
+ (DivergentFragOrOp<node, P>.ret src0_dag, src1_dag, clamp_dag),
+ (DivergentFragOrOp<node, P>.ret src0_dag, src1_dag)))];
list<dag> ret1 = [(set P.DstVT:$vdst,
- (DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3PMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))))];
+ !if(HasExplicitClamp,
+ (DivergentFragOrOp<node, P>.ret src0_dag, clamp_dag),
+ (DivergentFragOrOp<node, P>.ret src0_dag)))];
list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3,
!if(!eq(P.NumSrcArgs, 2), ret2,
@@ -54,18 +60,16 @@ class getVOP3PModPat<VOPProfile P, SDPatternOperator node> {
class getVOP3OpSelPat<VOPProfile P, SDPatternOperator node> {
list<dag> ret3 = [(set P.DstVT:$vdst,
- (DivergentFragOrOp<node, P>.ret (P.Src0VT !if(P.HasClamp, (VOP3OpSel0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp),
- (VOP3OpSel P.Src0VT:$src0, i32:$src0_modifiers))),
+ (DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3OpSel P.Src0VT:$src0, i32:$src0_modifiers)),
(P.Src1VT (VOP3OpSel P.Src1VT:$src1, i32:$src1_modifiers)),
(P.Src2VT (VOP3OpSel P.Src2VT:$src2, i32:$src2_modifiers))))];
list<dag> ret2 = [(set P.DstVT:$vdst,
- (DivergentFragOrOp<node, P>.ret !if(P.HasClamp, (P.Src0VT (VOP3OpSel0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)),
- (P.Src0VT (VOP3OpSel P.Src0VT:$src0, i32:$src0_modifiers))),
- (P.Src1VT (VOP3OpSel P.Src1VT:$src1, i32:$src1_modifiers))))];
+ (DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3OpSel P.Src0VT:$src0, i32:$src0_modifiers)),
+ (P.Src1VT (VOP3OpSel P.Src1VT:$src1, i32:$src1_modifiers))))];
list<dag> ret1 = [(set P.DstVT:$vdst,
- (DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3OpSel0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))))];
+ (DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3OpSel P.Src0VT:$src0, i32:$src0_modifiers))))];
list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3,
!if(!eq(P.NumSrcArgs, 2), ret2,
@@ -74,18 +78,18 @@ class getVOP3OpSelPat<VOPProfile P, SDPatternOperator node> {
class getVOP3OpSelModPat<VOPProfile P, SDPatternOperator node> {
list<dag> ret3 = [(set P.DstVT:$vdst,
- (DivergentFragOrOp<node, P>.ret (P.Src0VT !if(P.HasClamp, (VOP3OpSelMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp),
+ (DivergentFragOrOp<node, P>.ret (P.Src0VT !if(P.HasClamp, (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers),
(VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers))),
(P.Src1VT (VOP3OpSelMods P.Src1VT:$src1, i32:$src1_modifiers)),
(P.Src2VT (VOP3OpSelMods P.Src2VT:$src2, i32:$src2_modifiers))))];
list<dag> ret2 = [(set P.DstVT:$vdst,
- (DivergentFragOrOp<node, P>.ret !if(P.HasClamp, (P.Src0VT (VOP3OpSelMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)),
+ (DivergentFragOrOp<node, P>.ret !if(P.HasClamp, (P.Src0VT (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers)),
(P.Src0VT (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers))),
(P.Src1VT (VOP3OpSelMods P.Src1VT:$src1, i32:$src1_modifiers))))];
list<dag> ret1 = [(set P.DstVT:$vdst,
- (DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3OpSelMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))))];
+ (DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers))))];
list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3,
!if(!eq(P.NumSrcArgs, 2), ret2,
@@ -224,12 +228,13 @@ def VOP3b_I64_I1_I32_I32_I64 : VOPProfile<[i64, i32, i32, i64]> {
class VOP3Interp<string OpName, VOPProfile P, list<dag> pattern = []> :
VOP3_Pseudo<OpName, P, pattern> {
let AsmMatchConverter = "cvtVOP3Interp";
+ let mayRaiseFPException = 0;
}
def VOP3_INTERP : VOPProfile<[f32, f32, i32, untyped]> {
let Ins64 = (ins Src0Mod:$src0_modifiers, VRegSrc_32:$src0,
Attr:$attr, AttrChan:$attrchan,
- clampmod:$clamp, omod:$omod);
+ clampmod0:$clamp, omod0:$omod);
let Asm64 = "$vdst, $src0_modifiers, $attr$attrchan$clamp$omod";
}
@@ -237,7 +242,7 @@ def VOP3_INTERP : VOPProfile<[f32, f32, i32, untyped]> {
def VOP3_INTERP_MOV : VOPProfile<[f32, i32, i32, untyped]> {
let Ins64 = (ins InterpSlot:$src0,
Attr:$attr, AttrChan:$attrchan,
- clampmod:$clamp, omod:$omod);
+ clampmod0:$clamp, omod0:$omod);
let Asm64 = "$vdst, $src0, $attr$attrchan$clamp$omod";
@@ -286,17 +291,25 @@ class VOP3_INTERP16 <list<ValueType> ArgVT> : VOPProfile<ArgVT> {
let isCommutable = 1 in {
+let mayRaiseFPException = 0 in {
+let SubtargetPredicate = HasMadMacF32Insts in {
def V_MAD_LEGACY_F32 : VOP3Inst <"v_mad_legacy_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
def V_MAD_F32 : VOP3Inst <"v_mad_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, fmad>;
+} // End SubtargetPredicate = HasMadMacF32Insts
+
+let SubtargetPredicate = HasNoMadMacF32Insts in
+def V_FMA_LEGACY_F32 : VOP3Inst <"v_fma_legacy_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
+}
+
def V_MAD_I32_I24 : VOP3Inst <"v_mad_i32_i24", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
def V_MAD_U32_U24 : VOP3Inst <"v_mad_u32_u24", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
-def V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, fma>;
+def V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, any_fma>;
def V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_lerp>;
let SchedRW = [WriteDoubleAdd] in {
let FPDPRounding = 1 in {
-def V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, fma>;
-def V_ADD_F64 : VOP3Inst <"v_add_f64", VOP3_Profile<VOP_F64_F64_F64>, fadd, 1>;
+def V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, any_fma>;
+def V_ADD_F64 : VOP3Inst <"v_add_f64", VOP3_Profile<VOP_F64_F64_F64>, any_fadd, 1>;
def V_MUL_F64 : VOP3Inst <"v_mul_f64", VOP3_Profile<VOP_F64_F64_F64>, fmul, 1>;
} // End FPDPRounding = 1
def V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile<VOP_F64_F64_F64>, fminnum_like, 1>;
@@ -310,7 +323,7 @@ def V_MUL_LO_I32 : VOP3Inst <"v_mul_lo_i32", VOP3_Profile<VOP_I32_I32_I32>>;
def V_MUL_HI_I32 : VOP3Inst <"v_mul_hi_i32", VOP3_Profile<VOP_I32_I32_I32>, mulhs>;
} // End SchedRW = [WriteQuarterRate32]
-let Uses = [VCC, EXEC] in {
+let Uses = [MODE, VCC, EXEC] in {
// v_div_fmas_f32:
// result = src0 * src1 + src2
// if (vcc)
@@ -332,15 +345,20 @@ def V_DIV_FMAS_F64 : VOP3_Pseudo <"v_div_fmas_f64", VOP_F64_F64_F64_F64_VCC, []>
} // End isCommutable = 1
+let mayRaiseFPException = 0 in {
def V_CUBEID_F32 : VOP3Inst <"v_cubeid_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubeid>;
def V_CUBESC_F32 : VOP3Inst <"v_cubesc_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubesc>;
def V_CUBETC_F32 : VOP3Inst <"v_cubetc_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubetc>;
def V_CUBEMA_F32 : VOP3Inst <"v_cubema_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubema>;
+} // End mayRaiseFPException = 0
+
def V_BFE_U32 : VOP3Inst <"v_bfe_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfe_u32>;
def V_BFE_I32 : VOP3Inst <"v_bfe_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfe_i32>;
def V_BFI_B32 : VOP3Inst <"v_bfi_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfi>;
-def V_ALIGNBIT_B32 : VOP3Inst <"v_alignbit_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_alignbit>;
+def V_ALIGNBIT_B32 : VOP3Inst <"v_alignbit_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, fshr>;
def V_ALIGNBYTE_B32 : VOP3Inst <"v_alignbyte_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_alignbyte>;
+
+let mayRaiseFPException = 0 in { // XXX - Seems suspect but manual doesn't say it does
def V_MIN3_F32 : VOP3Inst <"v_min3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmin3>;
def V_MIN3_I32 : VOP3Inst <"v_min3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmin3>;
def V_MIN3_U32 : VOP3Inst <"v_min3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumin3>;
@@ -350,6 +368,8 @@ def V_MAX3_U32 : VOP3Inst <"v_max3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDG
def V_MED3_F32 : VOP3Inst <"v_med3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmed3>;
def V_MED3_I32 : VOP3Inst <"v_med3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmed3>;
def V_MED3_U32 : VOP3Inst <"v_med3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumed3>;
+} // End mayRaiseFPException = 0
+
def V_SAD_U8 : VOP3Inst <"v_sad_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
def V_SAD_HI_U8 : VOP3Inst <"v_sad_hi_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
def V_SAD_U16 : VOP3Inst <"v_sad_u16", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
@@ -362,6 +382,8 @@ def V_DIV_FIXUP_F64 : VOP3Inst <"v_div_fixup_f64", VOP3_Profile<VOP_F64_F64_F64_
def V_LDEXP_F64 : VOP3Inst <"v_ldexp_f64", VOP3_Profile<VOP_F64_F64_I32>, AMDGPUldexp, 1>;
} // End SchedRW = [WriteDoubleAdd], FPDPRounding = 1
+
+let mayRaiseFPException = 0 in { // Seems suspicious but manual doesn't say it does.
def V_DIV_SCALE_F32 : VOP3_Pseudo <"v_div_scale_f32", VOP3b_F32_I1_F32_F32_F32, [], 1> {
let SchedRW = [WriteFloatFMA, WriteSALU];
let AsmMatchConverter = "";
@@ -373,6 +395,7 @@ def V_DIV_SCALE_F64 : VOP3_Pseudo <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64,
let AsmMatchConverter = "";
let FPDPRounding = 1;
}
+} // End mayRaiseFPException = 0
def V_MSAD_U8 : VOP3Inst <"v_msad_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
@@ -380,17 +403,16 @@ let Constraints = "@earlyclobber $vdst" in {
def V_MQSAD_PK_U16_U8 : VOP3Inst <"v_mqsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64, VOP3_CLAMP>>;
} // End Constraints = "@earlyclobber $vdst"
-def V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I32>, AMDGPUtrig_preop> {
+def V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I32>, int_amdgcn_trig_preop> {
let SchedRW = [WriteDouble];
}
let SchedRW = [Write64Bit] in {
-let SubtargetPredicate = isGFX6GFX7GFX10 in {
+let SubtargetPredicate = isGFX6GFX7 in {
def V_LSHL_B64 : VOP3Inst <"v_lshl_b64", VOP3_Profile<VOP_I64_I64_I32>, shl>;
def V_LSHR_B64 : VOP3Inst <"v_lshr_b64", VOP3_Profile<VOP_I64_I64_I32>, srl>;
def V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile<VOP_I64_I64_I32>, sra>;
-def V_MULLIT_F32 : VOP3Inst <"v_mullit_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
-} // End SubtargetPredicate = isGFX6GFX7GFX10
+} // End SubtargetPredicate = isGFX6GFX7
let SubtargetPredicate = isGFX8Plus in {
def V_LSHLREV_B64 : VOP3Inst <"v_lshlrev_b64", VOP3_Profile<VOP_I64_I32_I64>, lshl_rev>;
@@ -399,6 +421,23 @@ def V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>, as
} // End SubtargetPredicate = isGFX8Plus
} // End SchedRW = [Write64Bit]
+def : GCNPat<
+ (i64 (getDivergentFrag<sext>.ret i16:$src)),
+ (REG_SEQUENCE VReg_64,
+ (i32 (V_BFE_I32 $src, (S_MOV_B32 (i32 0)), (S_MOV_B32 (i32 0x10)))), sub0,
+ (i32 (COPY_TO_REGCLASS
+ (V_ASHRREV_I32_e32 (S_MOV_B32 (i32 0x1f)), (i32 (V_BFE_I32 $src, (S_MOV_B32 (i32 0)), (S_MOV_B32 (i32 0x10))))
+ ), VGPR_32)), sub1)
+>;
+
+def : GCNPat<
+ (i32 (getDivergentFrag<sext>.ret i16:$src)),
+ (i32 (V_BFE_I32 $src, (S_MOV_B32 (i32 0)), (S_MOV_B32 (i32 0x10))))
+>;
+
+let SubtargetPredicate = isGFX6GFX7GFX10 in {
+def V_MULLIT_F32 : VOP3Inst <"v_mullit_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
+} // End SubtargetPredicate = isGFX6GFX7GFX10
let SchedRW = [Write32Bit] in {
let SubtargetPredicate = isGFX8Plus in {
@@ -417,7 +456,7 @@ let isCommutable = 1 in {
let SchedRW = [WriteQuarterRate32, WriteSALU] in {
def V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>;
def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>;
-} // End SchedRW = [WriteDouble, WriteSALU]
+} // End SchedRW = [WriteQuarterRate32, WriteSALU]
} // End isCommutable = 1
} // End SubtargetPredicate = isGFX7Plus
@@ -434,11 +473,11 @@ def V_DIV_FIXUP_F16_gfx9 : VOP3Inst <"v_div_fixup_f16_gfx9",
let FPDPRounding = 1;
}
-def V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fma> {
+def V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, any_fma> {
let Predicates = [Has16BitInsts, isGFX8Only];
let FPDPRounding = 1;
}
-def V_FMA_F16_gfx9 : VOP3Inst <"v_fma_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, fma> {
+def V_FMA_F16_gfx9 : VOP3Inst <"v_fma_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, any_fma> {
let renamedInGFX9 = 1;
let Predicates = [Has16BitInsts, isGFX9Plus];
let FPDPRounding = 1;
@@ -451,7 +490,7 @@ def V_MAD_U16 : VOP3Inst <"v_mad_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_CL
def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_CLAMP>>;
let FPDPRounding = 1 in {
def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fmad>;
-let Uses = [M0, EXEC] in {
+let Uses = [MODE, M0, EXEC] in {
// For some reason the intrinsic operands are in a different order
// from the instruction operands.
def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i32, f32]>,
@@ -462,7 +501,7 @@ def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i3
(i32 timm:$attr),
(i1 timm:$high),
M0))]>;
-} // End Uses = [M0, EXEC]
+} // End Uses = [MODE, M0, EXEC]
} // End FPDPRounding = 1
} // End renamedInGFX9 = 1
@@ -478,32 +517,29 @@ def V_MAD_I16_gfx9 : VOP3Inst <"v_mad_i16_gfx9", VOP3_Profile<VOP_I16_I16_I16_
def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f16, f32, i32, f32]>>;
} // End SubtargetPredicate = isGFX9Plus
-let Uses = [M0, EXEC], FPDPRounding = 1 in {
+let Uses = [MODE, M0, EXEC], FPDPRounding = 1 in {
def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>,
- [(set f32:$vdst, (AMDGPUinterp_p1ll_f16 f32:$src0, (i32 timm:$attrchan),
- (i32 timm:$attr),
- (i32 timm:$src0_modifiers),
- (i1 timm:$high),
- (i1 timm:$clamp),
- (i32 timm:$omod)))]>;
-def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>,
- [(set f32:$vdst, (AMDGPUinterp_p1lv_f16 f32:$src0, (i32 timm:$attrchan),
- (i32 timm:$attr),
- (i32 timm:$src0_modifiers),
- (f32 VRegSrc_32:$src2),
- (i32 timm:$src2_modifiers),
- (i1 timm:$high),
- (i1 timm:$clamp),
- (i32 timm:$omod)))]>;
-} // End Uses = [M0, EXEC], FPDPRounding = 1
+ [(set f32:$vdst, (int_amdgcn_interp_p1_f16 (VOP3Mods f32:$src0, i32:$src0_modifiers),
+ (i32 timm:$attrchan),
+ (i32 timm:$attr),
+ (i1 timm:$high), M0))]> {
+ // This predicate should only apply to the selection pattern. The
+ // instruction still exists and should decode on subtargets with
+ // other bank counts.
+ let OtherPredicates = [has32BankLDS];
+}
+
+
+def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>>;
+} // End Uses = [MODE, M0, EXEC], FPDPRounding = 1
} // End SubtargetPredicate = Has16BitInsts, isCommutable = 1
-let SubtargetPredicate = isGFX8Plus, Uses = [M0, EXEC] in {
+let SubtargetPredicate = isGFX8Plus, Uses = [MODE, M0, EXEC] in {
def V_INTERP_P1_F32_e64 : VOP3Interp <"v_interp_p1_f32", VOP3_INTERP>;
def V_INTERP_P2_F32_e64 : VOP3Interp <"v_interp_p2_f32", VOP3_INTERP>;
def V_INTERP_MOV_F32_e64 : VOP3Interp <"v_interp_mov_f32", VOP3_INTERP_MOV>;
-} // End SubtargetPredicate = isGFX8Plus, Uses = [M0, EXEC]
+} // End SubtargetPredicate = isGFX8Plus, Uses = [MODE, M0, EXEC]
let Predicates = [Has16BitInsts, isGFX6GFX7GFX8GFX9] in {
@@ -565,9 +601,20 @@ class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
}
return true;
- }]
-> {
+ }]> {
let PredicateCodeUsesOperands = 1;
+
+ // The divergence predicate is irrelevant in GlobalISel, as we have
+ // proper register bank checks. We also force all VOP instruction
+ // operands to VGPR, so we should not need to check the constant bus
+ // restriction.
+ //
+ // FIXME: With unlucky SGPR operands, we could penalize code by
+ // blocking folding SGPR->VGPR copies later.
+ // FIXME: There's no register bank verifier
+ // FIXME: Should add a way for the emitter to recognize this is a
+ // trivially true predicate to eliminate the check.
+ let GISelPredicateCode = [{return true;}];
}
let SubtargetPredicate = isGFX9Plus in {
@@ -602,14 +649,14 @@ def V_MAD_I32_I16 : VOP3Inst <"v_mad_i32_i16", VOP3_Profile<VOP_I32_I16_I16_I32,
def V_CVT_PKNORM_I16_F16 : VOP3Inst <"v_cvt_pknorm_i16_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>;
def V_CVT_PKNORM_U16_F16 : VOP3Inst <"v_cvt_pknorm_u16_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>;
-def V_ADD_I32_gfx9 : VOP3Inst <"v_add_i32_gfx9", VOP3_Profile<VOP_I32_I32_I32>>;
-def V_SUB_I32_gfx9 : VOP3Inst <"v_sub_i32_gfx9", VOP3_Profile<VOP_I32_I32_I32>>;
+def V_ADD_I32_gfx9 : VOP3Inst <"v_add_i32_gfx9", VOP3_Profile<VOP_I32_I32_I32_ARITH>>;
+def V_SUB_I32_gfx9 : VOP3Inst <"v_sub_i32_gfx9", VOP3_Profile<VOP_I32_I32_I32_ARITH>>;
class ThreeOp_i32_Pats <SDPatternOperator op1, SDPatternOperator op2, Instruction inst> : GCNPat <
// This matches (op2 (op1 i32:$src0, i32:$src1), i32:$src2) with conditions.
(ThreeOpFrag<op1, op2> i32:$src0, i32:$src1, i32:$src2),
- (inst i32:$src0, i32:$src1, i32:$src2)
+ (inst VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2)
>;
def : ThreeOp_i32_Pats<shl, add, V_LSHL_ADD_U32>;
@@ -634,6 +681,40 @@ def VOP3_PERMLANE_Profile : VOP3_Profile<VOPProfile <[i32, i32, i32, i32]>, VOP3
let HasOMod = 0;
}
+class PermlanePat<SDPatternOperator permlane,
+ Instruction inst> : GCNPat<
+ (permlane i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2,
+ timm:$fi, timm:$bc),
+ (inst (as_i1timm $fi), VGPR_32:$src0, (as_i1timm $bc),
+ SCSrc_b32:$src1, 0, SCSrc_b32:$src2, VGPR_32:$vdst_in)
+>;
+
+// Permlane intrinsic that has either fetch invalid or bound control
+// fields enabled.
+class BoundControlOrFetchInvalidPermlane<SDPatternOperator permlane> :
+ PatFrag<(ops node:$vdst_in, node:$src0, node:$src1, node:$src2,
+ node:$fi, node:$bc),
+          (permlane node:$vdst_in, node:$src0, node:$src1,
+                    node:$src2, node:$fi, node:$bc)> {
+ let PredicateCode = [{ return N->getConstantOperandVal(5) != 0 ||
+ N->getConstantOperandVal(6) != 0; }];
+ let GISelPredicateCode = [{
+ return MI.getOperand(6).getImm() != 0 ||
+ MI.getOperand(7).getImm() != 0;
+ }];
+}
+
+// Drop the input value if it won't be read.
+class PermlaneDiscardVDstIn<SDPatternOperator permlane,
+ Instruction inst> : GCNPat<
+ (permlane srcvalue, i32:$src0, i32:$src1, i32:$src2,
+ timm:$fi, timm:$bc),
+ (inst (as_i1timm $fi), VGPR_32:$src0, (as_i1timm $bc),
+ SCSrc_b32:$src1, 0, SCSrc_b32:$src2,
+ (IMPLICIT_DEF))
+>;
+
+
let SubtargetPredicate = isGFX10Plus in {
def V_XOR3_B32 : VOP3Inst <"v_xor3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
def : ThreeOp_i32_Pats<xor, xor, V_XOR3_B32>;
@@ -643,16 +724,35 @@ let SubtargetPredicate = isGFX10Plus in {
def V_PERMLANEX16_B32 : VOP3Inst <"v_permlanex16_b32", VOP3_PERMLANE_Profile>;
} // End $vdst = $vdst_in, DisableEncoding $vdst_in
- def : GCNPat<
- (int_amdgcn_permlane16 i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, timm:$fi, timm:$bc),
- (V_PERMLANE16_B32 (as_i1imm $fi), $src0, (as_i1imm $bc), $src1, 0, $src2, $vdst_in)
- >;
- def : GCNPat<
- (int_amdgcn_permlanex16 i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, timm:$fi, timm:$bc),
- (V_PERMLANEX16_B32 (as_i1imm $fi), $src0, (as_i1imm $bc), $src1, 0, $src2, $vdst_in)
- >;
+ def : PermlanePat<int_amdgcn_permlane16, V_PERMLANE16_B32>;
+ def : PermlanePat<int_amdgcn_permlanex16, V_PERMLANEX16_B32>;
+
+ def : PermlaneDiscardVDstIn<
+ BoundControlOrFetchInvalidPermlane<int_amdgcn_permlane16>,
+ V_PERMLANE16_B32>;
+ def : PermlaneDiscardVDstIn<
+ BoundControlOrFetchInvalidPermlane<int_amdgcn_permlanex16>,
+ V_PERMLANEX16_B32>;
} // End SubtargetPredicate = isGFX10Plus
+class DivFmasPat<ValueType vt, Instruction inst, Register CondReg> : GCNPat<
+ (AMDGPUdiv_fmas (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)),
+ (vt (VOP3Mods vt:$src1, i32:$src1_modifiers)),
+ (vt (VOP3Mods vt:$src2, i32:$src2_modifiers)),
+ (i1 CondReg)),
+ (inst $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2)
+>;
+
+let WaveSizePredicate = isWave64 in {
+def : DivFmasPat<f32, V_DIV_FMAS_F32, VCC>;
+def : DivFmasPat<f64, V_DIV_FMAS_F64, VCC>;
+}
+
+let WaveSizePredicate = isWave32 in {
+def : DivFmasPat<f32, V_DIV_FMAS_F32, VCC_LO>;
+def : DivFmasPat<f64, V_DIV_FMAS_F64, VCC_LO>;
+}
+
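A small hedged sketch of what differs between the two WaveSizePredicate blocks above: only the implicit condition register matched by DivFmasPat changes, the full 64-bit VCC in wave64 versus VCC_LO in wave32 (the helper name and string values are illustrative only).

  #include <string>
  std::string divFmasCondReg(unsigned WaveSize) {
    return WaveSize == 64 ? "VCC" : "VCC_LO";
  }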
//===----------------------------------------------------------------------===//
// Integer Clamp Patterns
//===----------------------------------------------------------------------===//
@@ -745,9 +845,9 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
defm V_READLANE_B32 : VOP3_Real_gfx10<0x360>;
-let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VSrc_b32:$vdst_in) in {
+let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in) in {
defm V_WRITELANE_B32 : VOP3_Real_gfx10<0x361>;
-} // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VSrc_b32:$vdst_in)
+} // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in)
defm V_XOR3_B32 : VOP3_Real_gfx10<0x178>;
defm V_LSHLREV_B64 : VOP3_Real_gfx10<0x2ff>;
@@ -925,6 +1025,10 @@ defm V_TRIG_PREOP_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x174>;
defm V_DIV_SCALE_F32 : VOP3be_Real_gfx6_gfx7_gfx10<0x16d>;
defm V_DIV_SCALE_F64 : VOP3be_Real_gfx6_gfx7_gfx10<0x16e>;
+// NB: Same opcode as v_mad_legacy_f32
+let DecoderNamespace = "GFX10_B" in
+defm V_FMA_LEGACY_F32 : VOP3_Real_gfx10<0x140>;
+
//===----------------------------------------------------------------------===//
// GFX8, GFX9 (VI).
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 933acc2278fd8..fc457ad212d48 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -1,4 +1,4 @@
-//===-- VOP3PInstructions.td - Vector Instruction Defintions --------------===//
+//===-- VOP3PInstructions.td - Vector Instruction Definitions -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -10,9 +10,11 @@
// VOP3P Classes
//===----------------------------------------------------------------------===//
-class VOP3PInst<string OpName, VOPProfile P, SDPatternOperator node = null_frag> :
+class VOP3PInst<string OpName, VOPProfile P,
+ SDPatternOperator node = null_frag,
+ bit HasExplicitClamp = 0> :
VOP3P_Pseudo<OpName, P,
- !if(P.HasModifiers, getVOP3PModPat<P, node>.ret, getVOP3Pat<P, node>.ret)
+ !if(P.HasModifiers, getVOP3PModPat<P, node, HasExplicitClamp>.ret, getVOP3Pat<P, node>.ret)
>;
// Non-packed instructions that use the VOP3P encoding.
@@ -29,9 +31,14 @@ class VOP3_VOP3PInst<string OpName, VOPProfile P, bit UseTiedOutput = 0,
!con(
(ins FP16InputMods:$src0_modifiers, VCSrc_f16:$src0,
FP16InputMods:$src1_modifiers, VCSrc_f16:$src1,
- FP16InputMods:$src2_modifiers, VCSrc_f16:$src2,
- clampmod:$clamp),
- !if(UseTiedOutput, (ins VGPR_32:$vdst_in), (ins))),
+ FP16InputMods:$src2_modifiers, VCSrc_f16:$src2),
+ // FIXME: clampmod0 misbehaves with the non-default vdst_in
+ // following it. For now, work around this by requiring clamp
+ // in tied patterns. This should use undef_tied_input, but it
+ // seems underdeveloped and doesn't apply the right register
+ // class constraints.
+ !if(UseTiedOutput, (ins clampmod:$clamp, VGPR_32:$vdst_in),
+ (ins clampmod0:$clamp))),
(ins op_sel:$op_sel, op_sel_hi:$op_sel_hi));
let Constraints = !if(UseTiedOutput, "$vdst = $vdst_in", "");
@@ -45,9 +52,9 @@ def V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_
def V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
let FPDPRounding = 1 in {
-def V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, fma>;
-def V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fadd>;
-def V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmul>;
+def V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, any_fma>;
+def V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, any_fadd>;
+def V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, any_fmul>;
} // End FPDPRounding = 1
def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum_like>;
def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fminnum_like>;
@@ -75,8 +82,8 @@ def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I1
// The constant will be emitted as a mov, and folded later.
// TODO: We could directly encode the immediate now
def : GCNPat<
- (add (v2i16 (VOP3PMods0 v2i16:$src0, i32:$src0_modifiers, i1:$clamp)), NegSubInlineConstV216:$src1),
- (V_PK_SUB_U16 $src0_modifiers, $src0, SRCMODS.OP_SEL_1, NegSubInlineConstV216:$src1, $clamp)
+ (add (v2i16 (VOP3PMods v2i16:$src0, i32:$src0_modifiers)), NegSubInlineConstV216:$src1),
+ (V_PK_SUB_U16 $src0_modifiers, $src0, SRCMODS.OP_SEL_1, NegSubInlineConstV216:$src1)
>;
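A per-lane model of why the rewrite above is sound, assuming $src1 is a splat of one 16-bit constant in both halves (the shape NegSubInlineConstV216 is meant to match): adding the negative constant is, modulo 2^16, the same packed subtract that the V_PK_SUB_U16 pattern selects.

  #include <cstdint>
  uint32_t pk_add_neg_const(uint32_t src0, uint16_t c) {
    uint16_t lo = uint16_t((src0 & 0xffffu) - c);  // lane 0: lo + (-c) == lo - c (mod 2^16)
    uint16_t hi = uint16_t((src0 >> 16) - c);      // lane 1
    return (uint32_t(hi) << 16) | lo;
  }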
multiclass MadFmaMixPats<SDPatternOperator fma_like,
@@ -142,10 +149,11 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
}
let SubtargetPredicate = HasMadMixInsts in {
+
// These are VOP3a-like opcodes which accept no omod.
// Size of src arguments (16/32) is controlled by op_sel.
// For 16-bit src arguments, their location (hi/lo) is controlled by op_sel_hi.
-let isCommutable = 1 in {
+let isCommutable = 1, mayRaiseFPException = 0 in {
def V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>;
let FPDPRounding = 1 in {
@@ -203,7 +211,7 @@ foreach Type = ["I", "U"] in
foreach Index = 0-3 in {
// Defines patterns that extract each Index'ed 8bit from an unsigned
// 32bit scalar value;
- def #Type#Index#"_8bit" : Extract<!shl(Index, 3), 255, !if (!eq (Type, "U"), 1, 0)>;
+ def Type#Index#"_8bit" : Extract<!shl(Index, 3), 255, !if (!eq (Type, "U"), 1, 0)>;
// Defines multiplication patterns where the multiplication is happening on each
// Index'ed 8bit of a 32bit scalar value.
@@ -211,8 +219,8 @@ foreach Type = ["I", "U"] in
def Mul#Type#_Elt#Index : PatFrag<
(ops node:$src0, node:$src1),
(!cast<HasOneUseBinOp>(!if (!eq (Type, "I"), AMDGPUmul_i24_oneuse, AMDGPUmul_u24_oneuse))
- (!cast<Extract>(#Type#Index#"_8bit") node:$src0),
- (!cast<Extract>(#Type#Index#"_8bit") node:$src1))>;
+ (!cast<Extract>(Type#Index#"_8bit") node:$src0),
+ (!cast<Extract>(Type#Index#"_8bit") node:$src1))>;
}
// Different variants of dot8 patterns cause a huge increase in the compile time.
@@ -231,15 +239,15 @@ foreach Type = ["I", "U"] in
foreach Index = 0-7 in {
// Defines patterns that extract each Index'ed 4bit from an unsigned
// 32bit scalar value;
- def #Type#Index#"_4bit" : Extract<!shl(Index, 2), 15, !if (!eq (Type, "U"), 1, 0)>;
+ def Type#Index#"_4bit" : Extract<!shl(Index, 2), 15, !if (!eq (Type, "U"), 1, 0)>;
// Defines multiplication patterns where the multiplication is happening on each
  // Index'ed 4bit of a 32bit scalar value.
def Mul#Type#Index#"_4bit" : PatFrag<
(ops node:$src0, node:$src1),
(!cast<HasOneUseBinOp>(!if (!eq (Type, "I"), NonACAMDGPUmul_i24_oneuse, NonACAMDGPUmul_u24_oneuse))
- (!cast<Extract>(#Type#Index#"_4bit") node:$src0),
- (!cast<Extract>(#Type#Index#"_4bit") node:$src1))>;
+ (!cast<Extract>(Type#Index#"_4bit") node:$src0),
+ (!cast<Extract>(Type#Index#"_4bit") node:$src1))>;
}
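The Extract PatFrags above boil down to a shift by Index*8 with mask 255 in the 8-bit case, and a shift by Index*4 with mask 15 in the 4-bit case. A plain C++ model of the unsigned dot products those pieces feed (a sketch of the combine's intent, not the instructions' full specification):

  #include <cstdint>
  uint32_t udot4_u8(uint32_t a, uint32_t b, uint32_t acc) {
    for (unsigned i = 0; i < 4; ++i)
      acc += ((a >> (i * 8)) & 255u) * ((b >> (i * 8)) & 255u);
    return acc;
  }
  uint32_t udot8_u4(uint32_t a, uint32_t b, uint32_t acc) {
    for (unsigned i = 0; i < 8; ++i)
      acc += ((a >> (i * 4)) & 15u) * ((b >> (i * 4)) & 15u);
    return acc;
  }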
class UDot2Pat<Instruction Inst> : GCNPat <
@@ -264,40 +272,30 @@ class SDot2Pat<Instruction Inst> : GCNPat <
let IsDOT = 1 in {
let SubtargetPredicate = HasDot2Insts in {
-def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile<VOP_F32_V2F16_V2F16_F32>>;
-def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>>;
-def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>>;
-def V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
-def V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
+def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16",
+ VOP3_Profile<VOP_F32_V2F16_V2F16_F32>,
+ AMDGPUfdot2, 1/*ExplicitClamp*/>;
+def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16",
+ VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_sdot2, 1>;
+def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16",
+ VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_udot2, 1>;
+def V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8",
+ VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4, 1>;
+def V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4",
+ VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot8, 1>;
} // End SubtargetPredicate = HasDot2Insts
let SubtargetPredicate = HasDot1Insts in {
-def V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
-def V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
+def V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8",
+ VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4, 1>;
+def V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4",
+ VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot8, 1>;
} // End SubtargetPredicate = HasDot1Insts
} // End let IsDOT = 1
-multiclass DotPats<SDPatternOperator dot_op,
- VOP3PInst dot_inst> {
- let SubtargetPredicate = dot_inst.SubtargetPredicate in
- def : GCNPat <
- (dot_op (dot_inst.Pfl.Src0VT (VOP3PMods0 dot_inst.Pfl.Src0VT:$src0, i32:$src0_modifiers)),
- (dot_inst.Pfl.Src1VT (VOP3PMods dot_inst.Pfl.Src1VT:$src1, i32:$src1_modifiers)),
- (dot_inst.Pfl.Src2VT (VOP3PMods dot_inst.Pfl.Src2VT:$src2, i32:$src2_modifiers)), i1:$clamp),
- (dot_inst $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, (as_i1imm $clamp))>;
-}
-
-defm : DotPats<AMDGPUfdot2, V_DOT2_F32_F16>;
-defm : DotPats<int_amdgcn_sdot2, V_DOT2_I32_I16>;
-defm : DotPats<int_amdgcn_udot2, V_DOT2_U32_U16>;
-defm : DotPats<int_amdgcn_sdot4, V_DOT4_I32_I8>;
-defm : DotPats<int_amdgcn_udot4, V_DOT4_U32_U8>;
-defm : DotPats<int_amdgcn_sdot8, V_DOT8_I32_I4>;
-defm : DotPats<int_amdgcn_udot8, V_DOT8_U32_U4>;
-
def : UDot2Pat<V_DOT2_U32_U16>;
def : SDot2Pat<V_DOT2_I32_I16>;
@@ -368,12 +366,16 @@ def VOPProfileMAI_F32_V4F16_X16 : VOPProfileMAI<VOP_V16F32_V4F16_V4F16_V16F32, A
def VOPProfileMAI_F32_V4F16_X32 : VOPProfileMAI<VOP_V32F32_V4F16_V4F16_V32F32, AISrc_1024_b32, ADst_1024, AVSrc_64>;
let Predicates = [HasMAIInsts] in {
+
+let isAsCheapAsAMove = 1, isReMaterializable = 1 in {
def V_ACCVGPR_READ_B32 : VOP3Inst<"v_accvgpr_read_b32", VOPProfileAccRead>;
def V_ACCVGPR_WRITE_B32 : VOP3Inst<"v_accvgpr_write_b32", VOPProfileAccWrite> {
let isMoveImm = 1;
}
+}
-let isConvergent = 1 in {
+// FP32 denorm mode is respected, rounding mode is not. Exceptions are not supported.
+let isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 in {
def V_MFMA_F32_4X4X1F32 : VOP3Inst<"v_mfma_f32_4x4x1f32", VOPProfileMAI_F32_F32_X4, int_amdgcn_mfma_f32_4x4x1f32>;
def V_MFMA_F32_4X4X4F16 : VOP3Inst<"v_mfma_f32_4x4x4f16", VOPProfileMAI_F32_V4F16_X4, int_amdgcn_mfma_f32_4x4x4f16>;
def V_MFMA_I32_4X4X4I8 : VOP3Inst<"v_mfma_i32_4x4x4i8", VOPProfileMAI_I32_I32_X4, int_amdgcn_mfma_i32_4x4x4i8>;
@@ -394,7 +396,7 @@ def V_MFMA_I32_32X32X4I8 : VOP3Inst<"v_mfma_i32_32x32x4i8", VOPProfileMAI_I3
def V_MFMA_I32_32X32X8I8 : VOP3Inst<"v_mfma_i32_32x32x8i8", VOPProfileMAI_I32_I32_X16, int_amdgcn_mfma_i32_32x32x8i8>;
def V_MFMA_F32_32X32X2BF16 : VOP3Inst<"v_mfma_f32_32x32x2bf16", VOPProfileMAI_F32_V2I16_X32, int_amdgcn_mfma_f32_32x32x2bf16>;
def V_MFMA_F32_32X32X4BF16 : VOP3Inst<"v_mfma_f32_32x32x4bf16", VOPProfileMAI_F32_V2I16_X16, int_amdgcn_mfma_f32_32x32x4bf16>;
-} // End isConvergent = 1
+} // End isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1
} // End SubtargetPredicate = HasMAIInsts
diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index 39d18794f947b..aa2fa260e7b52 100644
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -1,4 +1,4 @@
-//===-- VOPCInstructions.td - Vector Instruction Defintions ---------------===//
+//===-- VOPCInstructions.td - Vector Instruction Definitions --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -92,9 +92,11 @@ class VOPC_Pseudo <string opName, VOPC_Profile P, list<dag> pattern=[],
let mayStore = 0;
let hasSideEffects = 0;
+ let ReadsModeReg = isFloatType<P.Src0VT>.ret;
+
let VALU = 1;
let VOPC = 1;
- let Uses = [EXEC];
+ let Uses = !if(ReadsModeReg, [MODE, EXEC], [EXEC]);
let Defs = !if(DefVcc, [VCC], []);
VOPProfile Pfl = P;
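A self-contained model of the !if above, with plain enums standing in for LLVM's register definitions: a compare whose first source is floating point reads the MODE register (rounding/denorm state) in addition to EXEC, while integer compares read only EXEC. The same ReadsModeReg-driven !if recurs for the VOP3, SDWA and DPP pseudo classes later in this patch.

  #include <vector>
  enum PhysReg { EXEC, MODE };
  std::vector<PhysReg> vopcImplicitUses(bool Src0IsFP) {
    bool ReadsModeReg = Src0IsFP;            // isFloatType<P.Src0VT>.ret
    return ReadsModeReg ? std::vector<PhysReg>{MODE, EXEC}
                        : std::vector<PhysReg>{EXEC};
  }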
@@ -738,6 +740,9 @@ multiclass VOPC_CLASS_F64 <string opName> :
multiclass VOPCX_CLASS_F64 <string opName> :
VOPCX_Class_Pseudos <opName, VOPC_I1_F64_I32, VOPC_F64_I32>;
+// cmp_class ignores the FP mode and faithfully reports the unmodified
+// source value.
+let ReadsModeReg = 0, mayRaiseFPException = 0 in {
defm V_CMP_CLASS_F32 : VOPC_CLASS_F32 <"v_cmp_class_f32">;
defm V_CMPX_CLASS_F32 : VOPCX_CLASS_F32 <"v_cmpx_class_f32">;
defm V_CMP_CLASS_F64 : VOPC_CLASS_F64 <"v_cmp_class_f64">;
@@ -747,6 +752,7 @@ let SubtargetPredicate = Has16BitInsts in {
defm V_CMP_CLASS_F16 : VOPC_CLASS_F16 <"v_cmp_class_f16">;
defm V_CMPX_CLASS_F16 : VOPCX_CLASS_F16 <"v_cmpx_class_f16">;
}
+} // End ReadsModeReg = 0, mayRaiseFPException = 0
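One way to see what "ignores the FP mode" buys: classification looks only at the raw encoding of the source, so it is unaffected by MODE's denorm or rounding settings, which is exactly what ReadsModeReg = 0 expresses. The helper below is a generic IEEE-754 illustration of a single class test, not the ISA's class-mask encoding.

  #include <cstdint>
  #include <cstring>
  bool isSubnormalBits(float x) {
    uint32_t bits;
    std::memcpy(&bits, &x, sizeof bits);     // inspect the unmodified bit pattern
    uint32_t exp  = (bits >> 23) & 0xffu;
    uint32_t mant = bits & 0x7fffffu;
    return exp == 0 && mant != 0;            // subnormal regardless of FP mode
  }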
//===----------------------------------------------------------------------===//
// V_ICMPIntrinsic Pattern.
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index f208a1134a5a4..f8a83e5f74c0b 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -1,4 +1,4 @@
-//===-- VOPInstructions.td - Vector Instruction Defintions ----------------===//
+//===-- VOPInstructions.td - Vector Instruction Definitions ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -8,6 +8,8 @@
// dummies for outer let
class LetDummies {
+ bit ReadsModeReg;
+ bit mayRaiseFPException;
bit isCommutable;
bit isConvertibleToThreeAddress;
bit isMoveImm;
@@ -35,7 +37,7 @@ class VOPAnyCommon <dag outs, dag ins, string asm, list<dag> pattern> :
let hasSideEffects = 0;
let UseNamedOperandTable = 1;
let VALU = 1;
- let Uses = [EXEC];
+ let Uses = !if(ReadsModeReg, [MODE, EXEC], [EXEC]);
}
class VOP_Pseudo <string opName, string suffix, VOPProfile P, dag outs, dag ins,
@@ -118,7 +120,10 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
let ClampLo = P.HasClampLo;
let ClampHi = P.HasClampHi;
- let Uses = [EXEC];
+ let ReadsModeReg = !or(isFloatType<P.DstVT>.ret, isFloatType<P.Src0VT>.ret);
+
+ let mayRaiseFPException = ReadsModeReg;
+ let Uses = !if(ReadsModeReg, [MODE, EXEC], [EXEC]);
let AsmVariantName = AMDGPUAsmVariants.VOP3;
let AsmMatchConverter =
@@ -160,7 +165,7 @@ class VOP3_Real <VOP_Pseudo ps, int EncodingFamily> :
VOPProfile Pfl = ps.Pfl;
}
-// XXX - Is there any reason to distingusih this from regular VOP3
+// XXX - Is there any reason to distinguish this from regular VOP3
// here?
class VOP3P_Real<VOP_Pseudo ps, int EncodingFamily> :
VOP3_Real<ps, EncodingFamily>;
@@ -490,10 +495,14 @@ class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
let VALU = 1;
let SDWA = 1;
- let Uses = [EXEC];
- let SubtargetPredicate = !if(P.HasExtSDWA, HasSDWA, DisableInst);
- let AssemblerPredicate = !if(P.HasExtSDWA, HasSDWA, DisableInst);
+ let ReadsModeReg = !or(isFloatType<P.DstVT>.ret, isFloatType<P.Src0VT>.ret);
+
+ let mayRaiseFPException = ReadsModeReg;
+ let Uses = !if(ReadsModeReg, [MODE, EXEC], [EXEC]);
+
+ let SubtargetPredicate = HasSDWA;
+ let AssemblerPredicate = HasSDWA;
let AsmVariantName = !if(P.HasExtSDWA, AMDGPUAsmVariants.SDWA,
AMDGPUAsmVariants.Disable);
let DecoderNamespace = "SDWA";
@@ -542,8 +551,8 @@ class Base_VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> :
let Constraints = ps.Constraints;
let DisableEncoding = ps.DisableEncoding;
- let SubtargetPredicate = !if(ps.Pfl.HasExtSDWA9, HasSDWA9, DisableInst);
- let AssemblerPredicate = !if(ps.Pfl.HasExtSDWA9, HasSDWA9, DisableInst);
+ let SubtargetPredicate = HasSDWA9;
+ let AssemblerPredicate = HasSDWA9;
let AsmVariantName = !if(ps.Pfl.HasExtSDWA9, AMDGPUAsmVariants.SDWA9,
AMDGPUAsmVariants.Disable);
let DecoderNamespace = "SDWA9";
@@ -561,8 +570,8 @@ class VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> :
SIMCInstr <ps.PseudoInstr, SIEncodingFamily.SDWA9>;
class Base_VOP_SDWA10_Real<VOP_SDWA_Pseudo ps> : Base_VOP_SDWA9_Real<ps> {
- let SubtargetPredicate = !if(ps.Pfl.HasExtSDWA9, HasSDWA10, DisableInst);
- let AssemblerPredicate = !if(ps.Pfl.HasExtSDWA9, HasSDWA10, DisableInst);
+ let SubtargetPredicate = HasSDWA10;
+ let AssemblerPredicate = HasSDWA10;
let DecoderNamespace = "SDWA10";
}
@@ -607,7 +616,11 @@ class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
let VALU = 1;
let DPP = 1;
let Size = 8;
- let Uses = [EXEC];
+
+ let ReadsModeReg = !or(isFloatType<P.DstVT>.ret, isFloatType<P.Src0VT>.ret);
+
+ let mayRaiseFPException = ReadsModeReg;
+ let Uses = !if(ReadsModeReg, [MODE, EXEC], [EXEC]);
let isConvergent = 1;
string Mnemonic = OpName;
@@ -615,7 +628,7 @@ class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
let AsmMatchConverter = !if(!eq(P.HasModifiers,1), "cvtDPP", "");
let SubtargetPredicate = HasDPP;
- let AssemblerPredicate = !if(P.HasExtDPP, HasDPP, DisableInst);
+ let AssemblerPredicate = HasDPP;
let AsmVariantName = !if(P.HasExtDPP, AMDGPUAsmVariants.DPP,
AMDGPUAsmVariants.Disable);
let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", "");
@@ -670,7 +683,7 @@ class VOP_DPP <string OpName, VOPProfile P, bit IsDPP16,
let AsmMatchConverter = !if(!eq(P.HasModifiers,1), "cvtDPP", "");
let SubtargetPredicate = HasDPP;
- let AssemblerPredicate = !if(P.HasExtDPP, HasDPP, DisableInst);
+ let AssemblerPredicate = HasDPP;
let AsmVariantName = !if(P.HasExtDPP, AMDGPUAsmVariants.DPP,
AMDGPUAsmVariants.Disable);
let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", "");
@@ -702,7 +715,7 @@ class VOP_DPP8<string OpName, VOPProfile P> :
let AsmMatchConverter = "cvtDPP8";
let SubtargetPredicate = HasDPP8;
- let AssemblerPredicate = !if(P.HasExt, HasDPP8, DisableInst);
+ let AssemblerPredicate = HasDPP8;
let AsmVariantName = !if(P.HasExt, AMDGPUAsmVariants.DPP,
AMDGPUAsmVariants.Disable);
let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", "");